# Feature extraction from tweets (User profiling)
This notebook extracts the gender and age of each user in a Twitter dataset.
The extraction is performed using rules and lexicons.

In [0]:
import pandas as pd
from google.colab import drive
import re
import datetime
import math


In [0]:
def extract_age_group(age):
  """Bucket an age into a coarse age-group label.

  Args:
    age: A number, or anything ``float()`` accepts (e.g. the string 'nan').

  Returns:
    'nan' when ``age`` converts to a float NaN; otherwise "Young" (<= 30),
    "Middle_aged" (31-60) or "Elder" (> 60), decided on ``int(age)``.
  """
  # Missing ages arrive as NaN (or the string 'nan'); float() normalises both.
  if math.isnan(float(age)):
    return 'nan'

  years = int(age)
  if years <= 30:
    return "Young"
  if years <= 60:
    return "Middle_aged"
  return "Elder"

def extract_age_group_df(data_df):
  """Compute the age-group label for every row of a DataFrame.

  Args:
    data_df: DataFrame with an ``age`` column whose values are accepted by
      ``extract_age_group``.

  Returns:
    A pandas Series of age-group labels, one per row. It carries
    ``data_df``'s index so that assigning it back as a column lines up row by
    row even when the frame does not have a default 0..n-1 index.
  """
  age_groups = [extract_age_group(age) for age in data_df['age']]
  return pd.Series(age_groups, index=data_df.index)

def extract_age(name, username):
  """Guess a user's age from a birth year embedded in their name or handle.

  The first standalone 2- or 4-digit number in ``name + " " + username`` that
  is not directly followed by '%' is read as a birth year. Four-digit values
  must lie in 1951-1999; two-digit values must lie in 51-99 and are read as
  19xx. Anything else is treated as "no age found".

  Args:
    name: Display name (str).
    username: Account handle (str).

  Returns:
    The age in years (int, relative to the current year), or the string 'nan'
    when no plausible birth year is found.
  """
  # A single pattern is used both to detect and to extract the number, so a
  # percentage such as "75%" can never be picked up as a birth year.
  match = re.search(r"(?<!\d)(\d{2}|\d{4})(?!\d)(?!%)", name + " " + username)
  if match is None:
    return 'nan'

  birth_date = int(match.group(1))
  current_year = datetime.datetime.now().year
  if 1950 < birth_date < 2000:
    return current_year - birth_date
  if 50 < birth_date < 100:
    # Two-digit years are assumed to belong to the 1900s.
    return current_year - (1900 + birth_date)
  return 'nan'

def extract_age_df(data_df):
  """Estimate an age for every row of a DataFrame.

  Args:
    data_df: DataFrame with ``username`` and ``screen_name`` columns; both
      values are converted with ``str()`` and passed to ``extract_age`` as
      its ``name`` and ``username`` arguments respectively.

  Returns:
    A pandas Series with one ``extract_age`` result per row (an int age or the
    string 'nan'). It carries ``data_df``'s index so that assigning it back as
    a column lines up row by row even for a non-default index.
  """
  ages = [
    extract_age(str(name), str(handle))
    for name, handle in zip(data_df['username'], data_df['screen_name'])
  ]
  return pd.Series(ages, index=data_df.index)
    
def extract_gender_from_decription(description):
  """Guess a user's gender from gendered nouns in a profile description.

  Each lexicon noun counts once if it occurs in ``description`` as a whole
  word (case-insensitive), however many times it appears. The gender whose
  lexicon has more distinct hits wins.

  Args:
    description: Profile description text (str).

  Returns:
    "Male", "Female", or the string 'nan' when the counts tie (including the
    case where no noun is found).
  """
  # Every noun is listed exactly once: a duplicated entry would be counted
  # twice and bias the comparison towards that gender.
  male_nouns = ("actor", "author", "boy", "brother", "dad", "daddy", "man",
                "father", "grandfather", "husband", "king", "sir", "son",
                "uncle", "wizard", "waiter", "guy")
  female_nouns = ("actress", "authoress", "girl", "bride", "sister", "mum",
                  "mummy", "woman", "mother", "goddess", "grandmother",
                  "grandmom", "wife", "queen", "madam", "daughter", "aunt",
                  "witch", "waitress")

  def count_present(nouns):
    # Number of distinct nouns that appear as whole words in the description.
    return sum(
      1 for noun in nouns
      if re.search(rf'\b({noun})\b', description, re.IGNORECASE)
    )

  male_nouns_n = count_present(male_nouns)
  female_nouns_n = count_present(female_nouns)

  if male_nouns_n > female_nouns_n:
    return "Male"
  if female_nouns_n > male_nouns_n:
    return "Female"
  return 'nan'

# Gender codes used in the name lexicon, mapped to the labels this notebook emits.
_GENDER_LABELS = {'M': "Male", 'F': "Female"}

# Compiled lexicon rules per CSV path, so the file is read and compiled once
# instead of once per user.
_NAME_RULES_CACHE = {}


def _build_name_rules(names_df):
  """Turn a name lexicon into a list of (compiled whole-word pattern, label)."""
  rules = []
  for name, code in zip(names_df['name'], names_df['gender']):
    label = _GENDER_LABELS.get(code)
    # Rows with an unknown gender code never affect the result, and a missing
    # name must not be matched as the literal word "nan".
    if label is None or pd.isna(name):
      continue
    # re.escape keeps names containing regex metacharacters literal.
    pattern = re.compile(rf'\b({re.escape(str(name))})\b', re.IGNORECASE)
    rules.append((pattern, label))
  return rules


def extract_gender_from_username(username, names_df=None):
  """Guess a user's gender by looking for known first names in ``username``.

  Args:
    username: Text to search (str).
    names_df: Optional DataFrame with ``name`` and ``gender`` ('M'/'F')
      columns. When omitted, the lexicon is read from the notebook's
      ``name_gender.csv`` on Google Drive; that file is loaded once and reused
      on later calls.

  Returns:
    "Male" or "Female" for the last lexicon row whose name occurs in
    ``username`` as a whole word (case-insensitive), or the string 'nan' when
    no name matches.
  """
  if names_df is None:
    file = "/content/drive/My Drive/Colab Notebooks/Tweeter_user_profiling/name_gender.csv"
    if file not in _NAME_RULES_CACHE:
      _NAME_RULES_CACHE[file] = _build_name_rules(pd.read_csv(file))
    rules = _NAME_RULES_CACHE[file]
  else:
    rules = _build_name_rules(names_df)

  gender = 'nan'
  for pattern, label in rules:
    if pattern.search(username):
      # No early exit: when several names match, the last lexicon row wins.
      gender = label
  return gender

def extract_gender(username, description):
  """Guess a user's gender, trying the name lexicon before the description.

  Args:
    username: Text handed to ``extract_gender_from_username``.
    description: Text handed to ``extract_gender_from_decription`` when the
      name lookup yields 'nan'.

  Returns:
    Whatever the name lookup returns unless that is the string 'nan', in which
    case the result of the description lookup is returned instead.
  """
  from_name = extract_gender_from_username(username)
  if from_name != 'nan':
    return from_name
  # The name gave no answer: fall back to the description.
  return extract_gender_from_decription(description)

def extract_gender_df(data_df):
  """Guess a gender for every row of a DataFrame.

  Args:
    data_df: DataFrame with ``username`` and ``user_description`` columns;
      both values are converted with ``str()`` before being passed to
      ``extract_gender``.

  Returns:
    A pandas Series with one ``extract_gender`` result per row. It carries
    ``data_df``'s index so that assigning it back as a column lines up row by
    row even for a non-default index.
  """
  genders = [
    extract_gender(str(username), str(description))
    for username, description in zip(data_df['username'], data_df['user_description'])
  ]
  return pd.Series(genders, index=data_df.index)

  

In [0]:

# Load the tweet dataset from the mounted Google Drive.
file = "/content/drive/My Drive/Colab Notebooks/Tweeter_user_profiling/tweet_immigration_db.csv"
drive.mount('/content/drive')
raw_data_df = pd.read_csv(file)

# Profile only the first n_rows tweets. The explicit .copy() makes the subset
# an independent frame, so adding columns below is not a write onto a slice of
# the full dataset (pandas' chained-assignment / SettingWithCopy case).
n_rows = 60
raw_data_df = raw_data_df.iloc[:n_rows].copy()

# 'age_group' is derived from the 'age' column, so 'age' must be added first.
raw_data_df['gender'] = extract_gender_df(raw_data_df)
raw_data_df['age'] = extract_age_df(raw_data_df)
raw_data_df['age_group'] = extract_age_group_df(raw_data_df)

raw_data_df.head(n_rows)
  

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,keyword,timestamp,username,screen_name,user_location,user_description,user_followers_count,user_friends_count,user_favourites_count,text,place,coordinates,favorite_count,hashtags,retweet_count,tokenized_features,gender,age,age_group
0,immigration,2020-03-10 20:02:02.000000000,Eva Almea,AlmeaEva,"Boston,MA",,2,53,3,RT @HillaryClinton: Any human being can spread...,,,0,[''],2394,"['human', 'spread', 'virus', 'whether', 'citiz...",Female,,
1,immigration,2020-03-10 20:02:02.000000000,Richard Schumacher,rspyboy,"Meridian, Idaho","RN,retired; husband; father; grandfather; Holy...",240,536,41288,RT @Imm_Judges_NAIJ: EOIR has ordered immigrat...,,,0,[''],8995,"['EOIR', 'ordered', 'immigration', 'court', 's...",Male,,
2,immigration,2020-03-10 20:02:02.000000000,Ashlea Kosikowski 🎙🎥,AshleaOnAir,"Wilmington, NC","@WECTNews First at Four & News at 10 Anchor, M...",5284,4196,6171,RT @FridaGhitis: Trump administration orders i...,,,0,['coronavirus'],1120,"['Trump', 'administration', 'orders', 'immigra...",Female,,
3,immigration,2020-03-10 20:02:02.000000000,Demeralda,demeralda,"Michigan, USA",Political junkie. 6th generation Michigander. ...,2256,4967,257124,RT @bulldoghill: She had family in the U.S. an...,,,0,[''],5,"['family', 'US', 'passed', 'credible', 'fear',...",,,
4,immigration,2020-03-10 20:02:02.000000000,Caroline #BoycottNRA🚫,carnmcgrath,"Marietta, GA",sometimes when things are falling apart they m...,4524,5002,108407,RT @BradBeauregardJ: The Trump administration ...,,,0,[''],19,"['Trump', 'administration', 'wants', 'immigran...",Female,,
5,immigration,2020-03-10 20:02:02.000000000,❌❌❌T A G❌❌,Gerat1t,"South West, England","Normal patriotic, educated and intelligent guy...",1842,2578,97526,RT @PaulMer52: ANYONE ONE SEEN THIS MAN. \nWAN...,,,0,[''],9,"['ANYONE', 'ONE', 'SEEN', 'MAN', 'WANTED', 'LE...",Male,,
6,migrant,2020-03-10 20:02:02.000000000,Vivian ⭐️⭐️⭐️,vivianmtl,Montreal,#Habs #Trump #MAGA #Q #KAG #Istand #RainMakers...,15545,15587,166310,RT @DarrenPlymouth: #Greece is building a seco...,,,0,['Greece'],170,"['#Greece', 'building', 'secondary', 'border',...",Female,,
7,migrant,2020-03-10 20:02:02.000000000,Ric Adams,RicardoAdams,"Moorestown, NJ",#MAGA #KAG #NRA #censorship #FreeOwen #FreeRo...,8215,9010,88684,"RT @brollejrhoff: Greece:"" no to møsques, no t...",,,0,[''],56,"['Greece', 'no', 'sques', 'no', 'hospit', 'lit...",Male,,
8,migrant,2020-03-10 20:02:02.000000000,Moyin Adegbie,MoynAdegbie,Somewhere on the lithosphere.,@ work 24/7 365\nI follow me back😎👍,174,164,18679,RT @BubetteS: Three Swedish girls restrain cri...,,,0,[''],348,"['Three', 'Swedish', 'girls', 'restrain', 'cri...",,,
9,migrant,2020-03-10 20:02:02.000000000,Alan Williams,willy1805,Dartmouth NS,,236,373,3196,RT @Nicolas_in_GTA: @JustinTrudeau @BobRae48 N...,,,0,[''],1,"['No', 'push', 'back', 'barbarians', 'desert',...",Male,,
