# Importing necessary libraries and packages

In [2]:
import pandas as pd

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
import string

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/flatironschool/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/flatironschool/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/flatironschool/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Cleaning members text data

In [3]:
members=pd.read_csv('members_cleaned1.csv')

In [4]:
members.head()

Unnamed: 0,member_id,bio,city,joined,state,member_status,visited,group_id
0,3,not_found,New York,2007-05-01 22:04:37,NY,active,2009-09-18 18:32:23,490552
1,3,not_found,New York,2011-01-23 14:13:17,NY,active,2011-03-20 01:02:11,1474611
2,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",New York,2010-12-30 18:47:34,NY,active,2011-01-18 20:37:23,1490492
3,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",New York,2011-01-03 14:45:21,NY,active,2011-07-23 03:42:28,1515830
4,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",New York,2010-12-30 18:34:50,NY,active,2011-06-13 18:33:23,1574965


As we see the database contains members and groups they are belonged to. That's why the member id is not unique.
So we need to have a database with unique members id, in this way we will have the unique bio of each member, which can be further processed and used for train the model.
For further investigation we need to have just the columns with members id and their description.

In [5]:
col=['member_id', 'bio']
mem_bio=members[col]

In [6]:
mem_bio.head()

Unnamed: 0,member_id,bio
0,3,not_found
1,3,not_found
2,3,"Hi, I'm Matt. I'm an entrepreneur who has star..."
3,3,"Hi, I'm Matt. I'm an entrepreneur who has star..."
4,3,"Hi, I'm Matt. I'm an entrepreneur who has star..."


In [7]:
bio=mem_bio['bio']

So, how to choose which unique id of a member to leave and which one to delete. We will count the words in the bio, and will keep each id with the bigger amount of words in the bio.

In [8]:
text = []

for clue in bio:
    sentence = clue.translate(str.maketrans('', '',string.punctuation)).split(' ')
    
    new_sent = []
    for word in sentence:
        new_sent.append(word.lower())
    
    text.append(new_sent)

In [9]:
text[0]

['notfound']

In [10]:
count_lists=[]
for lst in text:
    count_lists.append(len(lst)) 

In [11]:
count_df=pd.DataFrame(count_lists)

In [12]:
members_counts=pd.merge(left=mem_bio, right=count_df, left_index=True, right_index=True)

In [13]:
members_counts=members_counts.rename(columns={0:'count'})

In [14]:
members_counts.head()

Unnamed: 0,member_id,bio,count
0,3,not_found,1
1,3,not_found,1
2,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",31
3,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",31
4,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",31


In [15]:
max_count=members_counts.groupby('member_id').max()

In [16]:
max_count.head()

Unnamed: 0_level_0,bio,count
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,not_found,41
6,part of the ny tech community for almost 20yrs...,38
36,not_found,1
65,on the Go team at Google. work on Camlistore n...,13
82,not_found,11


In [17]:
filtered_bio=pd.merge(left=members_counts, right=max_count, how='inner', on=['member_id', 'count'])

In [18]:
filtered_bio.head()

Unnamed: 0,member_id,bio_x,count,bio_y
0,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",41,not_found
1,6,i was hoping for a meetup like this. i studied...,38,part of the ny tech community for almost 20yrs...
2,36,not_found,1,not_found
3,36,not_found,1,not_found
4,36,not_found,1,not_found


In [19]:
filtered_bio=filtered_bio.drop_duplicates(subset='member_id',keep='first')

In [20]:
filtered_bio.head()

Unnamed: 0,member_id,bio_x,count,bio_y
0,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",41,not_found
1,6,i was hoping for a meetup like this. i studied...,38,part of the ny tech community for almost 20yrs...
2,36,not_found,1,not_found
5,65,on the Go team at Google. work on Camlistore n...,13,on the Go team at Google. work on Camlistore n...
6,82,I write code for a living and occasionally dab...,11,not_found


In [21]:
filtered_bio.loc[filtered_bio['count']==0]

Unnamed: 0,member_id,bio_x,count,bio_y


In [22]:
col2=['member_id', 'bio_x']
unique_id=filtered_bio[col2]

In [23]:
unique_id.head()

Unnamed: 0,member_id,bio_x
0,3,"Hi, I'm Matt. I'm an entrepreneur who has star..."
1,6,i was hoping for a meetup like this. i studied...
2,36,not_found
5,65,on the Go team at Google. work on Camlistore n...
6,82,I write code for a living and occasionally dab...


In [24]:
unique_id=unique_id.reset_index()

In [25]:
unique_id.head()

Unnamed: 0,index,member_id,bio_x
0,0,3,"Hi, I'm Matt. I'm an entrepreneur who has star..."
1,1,6,i was hoping for a meetup like this. i studied...
2,2,36,not_found
3,5,65,on the Go team at Google. work on Camlistore n...
4,6,82,I write code for a living and occasionally dab...


In [None]:
unique_id['member_id'].value_counts(dropna=False).head()

In [27]:
# unique_id.to_csv('unique_members-id.csv', index=False)

Now, as we have the unique id of the members, we will merge 2 databases, the database that contains the members id and their full description and database with tags based on members bio.

In [28]:
df_lem_tags=pd.read_csv('df_lem_tags.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
df_lem_tags.shape

(1087806, 49)

In [30]:
df_lem_tags1=df_lem_tags.dropna(how='all')

In [31]:
unique_id.shape

(1087806, 3)

In [32]:
id_lem=pd.merge(left=unique_id, right=df_lem_tags1, left_index=True, right_index=True, how='inner')

In [33]:
id_lem.shape

(1086664, 52)

In [34]:
# id_lem.to_csv('bow_nonull_rows.csv', index=False)

In [35]:
id_lem.head()

Unnamed: 0,index,member_id,bio_x,0,1,2,3,4,5,6,...,39,40,41,42,43,44,45,46,47,48
0,0,3,"Hi, I'm Matt. I'm an entrepreneur who has star...",hi,im,matt,im,entrepreneur,started,four,...,,,,,,,,,,
1,1,6,i was hoping for a meetup like this. i studied...,hoping,meetup,like,studied,human,factor,engineering,...,,,,,,,,,,
2,2,36,not_found,notfound,,,,,,,...,,,,,,,,,,
3,5,65,on the Go team at Google. work on Camlistore n...,go,team,google,work,camlistore,nearly,full,...,,,,,,,,,,
4,6,82,I write code for a living and occasionally dab...,write,code,living,occasionally,dabble,devops,,...,,,,,,,,,,


In [36]:
id_members_with_bio=id_lem.dropna(thresh=3)

Now we are going to eliminate all the values "not_found" from the data, because we are interested in consistent texts
which will train the model.

In [38]:
members_with_bio=unique_id.loc[unique_id['bio_x'] != 'not_found']

In [39]:
members_with_bio=members_with_bio.reset_index()

In [40]:
members_with_bio.shape

(401154, 4)

In [41]:
bow = []

for clue in members_with_bio['bio_x']:
    sentence = clue.translate(str.maketrans('', '',string.punctuation)).split(' ')
    
    new_sent = []
    for word in sentence:
        new_sent.append(word.lower())
    
    bow.append(new_sent)

In [42]:
bow

[['hi',
  'im',
  'matt',
  'im',
  'an',
  'entrepreneur',
  'who',
  'has',
  'started',
  'four',
  'companies',
  'including',
  'as',
  'a',
  'cofounder',
  'of',
  'meetupcom',
  'im',
  'currently',
  'an',
  'eir',
  'for',
  'polaris',
  'ventures',
  'working',
  'out',
  'of',
  'dogpatch',
  'labs',
  'in',
  'nycnnim',
  'not',
  'a',
  'girl',
  'but',
  'i',
  'want',
  'to',
  'learn',
  'to',
  'code'],
 ['i',
  'was',
  'hoping',
  'for',
  'a',
  'meetup',
  'like',
  'this',
  'i',
  'studied',
  'human',
  'factors',
  'engineering',
  '',
  'business',
  'at',
  'u',
  'of',
  'iowa',
  'have',
  'worked',
  'in',
  'the',
  'nyc',
  'internet',
  'industry',
  'since',
  '1995',
  'founded',
  'meetup',
  'in',
  '2002',
  'where',
  'i',
  'am',
  'a',
  'productfocused',
  'ceo'],
 ['on',
  'the',
  'go',
  'team',
  'at',
  'google',
  'work',
  'on',
  'camlistore',
  'nearly',
  'full',
  'time',
  'now'],
 ['i',
  'write',
  'code',
  'for',
  'a',
  'livi

In [43]:
text_df=pd.DataFrame(bow)

In [44]:
text_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,175,176,177,178,179,180,181,182,183,184
0,hi,im,matt,im,an,entrepreneur,who,has,started,four,...,,,,,,,,,,
1,i,was,hoping,for,a,meetup,like,this,i,studied,...,,,,,,,,,,
2,on,the,go,team,at,google,work,on,camlistore,nearly,...,,,,,,,,,,
3,i,write,code,for,a,living,and,occasionally,dabble,in,...,,,,,,,,,,
4,programmer,curmudgeon,etc,,,,,,,,...,,,,,,,,,,


In [45]:
bow_id_m_df=pd.merge(left=members_with_bio[['member_id']], right=text_df, left_index=True, right_index=True)

In [46]:
bow_id_m_df.head()

Unnamed: 0,member_id,0,1,2,3,4,5,6,7,8,...,175,176,177,178,179,180,181,182,183,184
0,3,hi,im,matt,im,an,entrepreneur,who,has,started,...,,,,,,,,,,
1,6,i,was,hoping,for,a,meetup,like,this,i,...,,,,,,,,,,
2,65,on,the,go,team,at,google,work,on,camlistore,...,,,,,,,,,,
3,82,i,write,code,for,a,living,and,occasionally,dabble,...,,,,,,,,,,
4,176,programmer,curmudgeon,etc,,,,,,,...,,,,,,,,,,


In [47]:
bow_id_m_df.shape

(401154, 186)

In [48]:
bow_id_m_df[0].value_counts().head()

i        76350
hi       57147
im       36994
hello    21095
my       17888
Name: 0, dtype: int64

In [49]:
bow_id_m_df=bow_id_m_df.loc[bow_id_m_df[0] != '']

In [50]:
bow_id_m_df.shape

(400124, 186)

In [51]:
filtered_members=bow_id_m_df.reset_index()

In [52]:
filtered_members.head()

Unnamed: 0,index,member_id,0,1,2,3,4,5,6,7,...,175,176,177,178,179,180,181,182,183,184
0,0,3,hi,im,matt,im,an,entrepreneur,who,has,...,,,,,,,,,,
1,1,6,i,was,hoping,for,a,meetup,like,this,...,,,,,,,,,,
2,2,65,on,the,go,team,at,google,work,on,...,,,,,,,,,,
3,3,82,i,write,code,for,a,living,and,occasionally,...,,,,,,,,,,
4,4,176,programmer,curmudgeon,etc,,,,,,...,,,,,,,,,,


In [53]:
# filtered_members.to_csv('filtered_members.csv', index=False)

In [54]:
filtered_members_sample=filtered_members.sample(n=25000, random_state=1)

In [55]:
# filtered_members_sample.to_csv('filtered_members_sample.csv', index=False)

# Cleaning groups text data

In [56]:
groups=pd.read_csv('groups_cleaned.csv')

In [57]:
groups.head()

Unnamed: 0,group_id,category_id,category_name,city,created,description,group_photo_type,join_mode,members,group_name,organizer_member_id,organizer_name,organizer_photo_type,rating,state,utc_offset,visibility,who
0,6388,14,health/wellbeing,New York,2002-11-21 16:50:46,Those who practice or hold a strong interest i...,event,open,1440,Alternative Health NYC,1513133,Joel E.,member,4.39,NY,-14400,public,Explorers of Health
1,6510,4,community/environment,New York,2003-05-20 14:48:54,The New York Alternative Energy Meetupis for t...,event,open,969,Alternative Energy Meetup,3955940,Yair Greenbaum,member,4.31,NY,-14400,public,Clean Energy Supporters
2,8458,26,pets/animals,New York,2004-03-27 09:55:41,not_found,event,open,2930,NYC Animal Rights,1809940,Santos,member,4.84,NY,-14400,public,Animal Voices
3,8940,29,sci-fi/fantasy,New York,2002-11-16 04:49:16,Welcome to the The New York City Anime Meetup ...,event,open,5080,The New York City Anime Group,2548151,Al Mejias,member,4.46,NY,-14400,public,Anime Fans
4,10104,26,pets/animals,New York,2003-10-22 21:39:49,"We welcome those who support pits, even if you...",event,open,2097,NYC Pit Bull Group,1929168,Amy,member,4.09,NY,-14400,public_limited,"NYC Pits & People, Dog Lovers"


In [58]:
groups.shape

(16310, 18)

In [59]:
groups_with_description=groups.loc[groups['description'] != 'not_found']

In [60]:
groups_with_description.shape

(16284, 18)

In [61]:
groups_with_description=groups_with_description.reset_index()

In [62]:
cols5=['group_id', 'group_name']
groups_names=groups_with_description[cols5]

In [63]:
bow_gr = []

for clue in groups_with_description['description']:
    sentence = clue.translate(str.maketrans('', '',string.punctuation)).split(' ')
    
    new_sent = []
    for word in sentence:
        new_sent.append(word.lower())
    
    bow_gr.append(new_sent)

In [64]:
bow_gr

[['those',
  'who',
  'practice',
  'or',
  'hold',
  'a',
  'strong',
  'interest',
  'in',
  'the',
  'many',
  'fields',
  'of',
  'alternative',
  'medicine',
  'and',
  'health'],
 ['the',
  'new',
  'york',
  'alternative',
  'energy',
  'meetupis',
  'for',
  'those',
  'interested',
  'in',
  'learning',
  'about',
  'alternative',
  'energy',
  'technologies',
  'pv',
  'solarthermal',
  'wind',
  'geothermal',
  'biodiesel',
  'etc',
  'conservation',
  'and',
  'the',
  'alternative',
  'energy',
  'business'],
 ['welcome',
  'to',
  'the',
  'the',
  'new',
  'york',
  'city',
  'anime',
  'meetup',
  'group',
  'the',
  'best',
  'place',
  'in',
  'the',
  'new',
  'york',
  'city',
  'area',
  'where',
  'anime',
  'manga',
  'cosplay',
  'and',
  'video',
  'game',
  'fans',
  'hang',
  'out',
  'and',
  'talk',
  'about',
  'the',
  'latest',
  'imports',
  'and',
  'conventions',
  'eat',
  'some',
  'sushi',
  'and',
  'sing',
  'some',
  'karaokenour',
  'usual',
  

In [65]:
bow_gr_df=pd.DataFrame(bow_gr)

In [66]:
bow_gr_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2843,2844,2845,2846,2847,2848,2849,2850,2851,2852
0,those,who,practice,or,hold,a,strong,interest,in,the,...,,,,,,,,,,
1,the,new,york,alternative,energy,meetupis,for,those,interested,in,...,,,,,,,,,,
2,welcome,to,the,the,new,york,city,anime,meetup,group,...,,,,,,,,,,
3,we,welcome,those,who,support,pits,even,if,you,arent,...,,,,,,,,,,
4,hello,,michael,kim,jamal,riegelman,the,director,of,the,...,,,,,,,,,,


In [67]:
bow_id_g_df=pd.merge(left=groups_with_description[['group_id']], right=bow_gr_df, left_index=True, right_index=True

SyntaxError: unexpected EOF while parsing (<ipython-input-67-1ae0894399db>, line 1)

In [None]:
bow_id_g_df.head()

In [None]:
bow_id_g_df.shape

In [None]:
bow_id_g_df[0].value_counts().head()

In [None]:
bow_id_g_df=bow_id_g_df.loc[bow_id_g_df[0] != '']

In [None]:
bow_id_g_df.shape

In [None]:
filtered_groups=bow_id_g_df.reset_index()

In [None]:
# filtered_groups.to_csv('filtered_groups.csv', index=False)

In [None]:
titles=pd.merge(left=groups_names, right=filtered_groups, how='right', on='group_id')

In [None]:
titles.head()

In [None]:
# titles.to_csv('titles_groups.csv', index=False)