In [1]:
import nltk.corpus
import bs4
import numpy as np
import pandas as pd
import re
import json
import sys
import nltk
nltk.download('cmudict')


[nltk_data] Downloading package cmudict to /Users/lega/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [2]:


# The stopwords corpus is a separate NLTK data package from cmudict; fetch it here
# so that a fresh environment does not fail with LookupError on the next line.
# nltk.download is a no-op (apart from a check) when the package is already present.
nltk.download('stopwords', quiet=True)

STOPWORDS = set(nltk.corpus.stopwords.words('english'))  # set of English stopwords
CMU_DICT = nltk.corpus.cmudict.dict()  # the cmudict corpus loaded as a Python dict


def add_wordlist_cols(df, text_col='text', terms_json='data/fe_dicts.json'):
    """
    Append one column per wordlist variable to a dataframe of texts.

    The JSON file at ``terms_json`` is loaded as a mapping from variable name to a list of terms. Each value of
    ``df[text_col]`` is scanned with ``_extract_terms`` and the per-variable lists of matched terms are joined to
    ``df`` column-wise, aligned on the index of ``df``.

    Args:
         df         -- the dataframe holding the texts
         text_col   -- the name of the column in ``df`` whose values are scanned
         terms_json -- the path to the JSON file mapping variables to term lists

    Returns:
         A new dataframe made of the columns of ``df`` followed by the columns of matched terms.
    """
    with open(terms_json, 'r') as handle:
        wordlists = json.load(handle)

    def matches_for(text):
        # One dict per row: variable name -> terms found in this row's text.
        return _extract_terms(text, wordlists)

    per_row_matches = list(df[text_col].apply(matches_for))
    match_frame = pd.DataFrame(per_row_matches, index=df.index)
    return pd.concat([df, match_frame], axis=1)


def get_wordlists_for_text(text, terms_json='../constants/terms.json'):
    """
    Look up, for one text, which terms of each wordlist occur in it.

    The JSON file at ``terms_json`` is loaded as a mapping from variable name to a list of terms, and the text is
    scanned with ``_extract_terms``.

    Args:
         text       -- the string to analyse
         terms_json -- the path to the JSON file mapping variables to term lists

    Returns:
         A dictionary mapping each variable to the list of its terms found in ``text``.
    """
    with open(terms_json, 'r') as handle:
        wordlists = json.load(handle)

    found_terms = _extract_terms(text, wordlists)
    return found_terms


def _extract_terms(text, term_dict):
    """
    Given a text and a dictionary of variables and associated term wordlists, return a dictionary with the words found
    in the text for each variable.

    Args:
         text      -- text to analyse
         term_dict -- a dictionary with keys being the variables and values being lists of associated words
    """
    return {t: [x for x in term_dict[t] if (len(re.findall("\\b{}\\b".format(str(x.lower())), text.lower())) > 0)] for t in term_dict}





In [3]:
# Put the parent directory on the import path so the project package imported below can be resolved.
sys.path.append('../')


# NOTE(review): nlp_utils is not referenced in the cells visible here; the functions called
# below are the ones defined in this notebook. Confirm whether this import is still needed.
from ht_project import nlp_utils

# Absolute, machine-specific locations of the term wordlists (JSON) and the input texts (CSV).
TERMS_JSON = '/Users/lega/code/QZKZ3/ht_project/ht_project/data/fe_dicts.json'
TEXT_FILENAME = '/Users/lega/code/QZKZ3/ht_project/raw_data/final_dataframe.csv'

# TODO: should output ones and zeros instead of matches

# Load the input CSV into a dataframe.
df = pd.read_csv(TEXT_FILENAME, delimiter = ',' , encoding = 'utf-8')

# Trial run on the first 10 rows only.
df_nu = add_wordlist_cols(df[0:10], text_col='text', terms_json=TERMS_JSON)

#df.to_csv("raw_data/feature_eng_df.csv")




In [4]:
# Display the loaded input frame (pandas abbreviates the middle rows in the rendered output).
df

Unnamed: 0.1,Unnamed: 0,text,name,year,departure,arrival,theme,latitude,longitude,date_slavery,gender,accuracy
0,0,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,American Samoa,Forced labour,-14.354000,-170.773600,,female,67.0
1,1,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Lebanon,Domestic slavery,33.900000,35.500000,,female,98.0
2,2,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,India,Domestic slavery,24.071318,79.652396,2002-2004,female,97.0
3,3,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,United States,Domestic slavery,40.412044,-96.870925,1976-1981,female,100.0
4,4,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,"['Sudan ', 'United Kingdom ']",War slavery,16.865226,32.271106,1994-2000,male,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1353,1353,"My name is Suleiman Ali, I’m Sudanese. I want ...",Suleiman,2020,Egypt,Libya,Trafficking,26.335100,17.228331,,male,100.0
1354,1354,"In Nigeria, it was difficult to make a living....",Shola,2020,Nigeria,Libya,too much themes,26.335100,17.228331,,male,80.0
1355,1355,"My name is Seif Eldein. I’m 24 years old, from...",Seif,2020,Chad,Libya,"['Trafficking', 'Forced labour']",26.335100,17.228331,,male,100.0
1356,1356,I was begging and sleeping in the street behin...,Rita,2020,Nigeria,"['Italy ', 'Libya ']","['Sexual exploitation', 'Trafficking']",43.769562,11.255814,,female,100.0


In [5]:
# Show the trial-run frame from column position 33 onwards.
# NOTE(review): 33 is a magic offset. The input frame displays only about a dozen columns, so this
# slice appears to start partway into the added wordlist columns — confirm the intended offset.
df_nu.iloc[:,33:]

Unnamed: 0,recruiterRelationIntimatePartner,recruiterRelationFamily,rec_method_social_media,traf_type_child,isSexualExploit,isForcedLabour,isForcedMarriage,traf_type_cmarriage,traf_type_domestic,traf_type_fcriminality,...,transport_truck,age_0_to_8,age_9_to_17,age_18_to_20,age_21_to_23,age_24_to_26,age_27_to_29,age_30-38,age_39_47,age_48+
0,[],[],[],[],"[sex, sleep with]",[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
1,[],[],[],[],[],"[carpets, cleaners]",[],[],"[maid, chores]",[],...,[],[],[],[],[],[],[],[],[],[]
2,[],[],[],[],[],[],[],[],[],[steal],...,[],[],[I was 12],[],[],[],[],[],[],[]
3,[],[],[],[],[raped],"[factory, farm, cleaning]",[],[],[cleaning],[],...,[],[],[I was 15],[],[],[],[],[],[],[]
4,[],[],[],[],[raped],[cleaning],[],[],"[domestic, cleaning, clean the house]",[],...,[],[],[I was 12],[],[],[],[],[],[],[]
5,[],[],[],[],[],[loom],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
6,[],[],[],[],[raped],[],[],[],[],[],...,"[truck, lorries]",[],[],[],[],[],[],[],[],[]
7,[],[],[],[],[],[cleaning],[],[],"[ironing, cooking, cleaning, cooking, laundry]",[],...,[],[],[],[],[],[],[],[],[],[]
8,[],[born into],[],[infancy],"[sexual, prostitution, brothels, pornography, ...",[],[],[],[],[],...,[],[I was one],[],[],[],[],[],[],[],[]
9,[],[],[],[],[],"[cleaning, cleaned]",[],[],"[domestic, cleaning, iron, babysitting, sweep,...",[],...,[],[],[],[],[],[],[],[],[],[]


In [6]:
# Run the wordlist extraction on the whole frame (the earlier trial used only the first 10 rows).
df_cols_added = add_wordlist_cols(df, text_col='text', terms_json=TERMS_JSON)

In [7]:
# Display the input frame together with the added wordlist columns.
df_cols_added

Unnamed: 0.1,Unnamed: 0,text,name,year,departure,arrival,theme,latitude,longitude,date_slavery,...,transport_truck,age_0_to_8,age_9_to_17,age_18_to_20,age_21_to_23,age_24_to_26,age_27_to_29,age_30-38,age_39_47,age_48+
0,0,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,American Samoa,Forced labour,-14.354000,-170.773600,,...,[],[],[],[],[],[],[],[],[],[]
1,1,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Lebanon,Domestic slavery,33.900000,35.500000,,...,[],[],[],[],[],[],[],[],[],[]
2,2,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,India,Domestic slavery,24.071318,79.652396,2002-2004,...,[],[],[I was 12],[],[],[],[],[],[],[]
3,3,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,United States,Domestic slavery,40.412044,-96.870925,1976-1981,...,[],[],[I was 15],[],[],[],[],[],[],[]
4,4,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,"['Sudan ', 'United Kingdom ']",War slavery,16.865226,32.271106,1994-2000,...,[],[],[I was 12],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1353,1353,"My name is Suleiman Ali, I’m Sudanese. I want ...",Suleiman,2020,Egypt,Libya,Trafficking,26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1354,1354,"In Nigeria, it was difficult to make a living....",Shola,2020,Nigeria,Libya,too much themes,26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1355,1355,"My name is Seif Eldein. I’m 24 years old, from...",Seif,2020,Chad,Libya,"['Trafficking', 'Forced labour']",26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1356,1356,I was begging and sleeping in the street behin...,Rita,2020,Nigeria,"['Italy ', 'Libya ']","['Sexual exploitation', 'Trafficking']",43.769562,11.255814,,...,[truck],[],[],[],[],[],[],[],[],[]


In [8]:
# Write the result to CSV without the index column. The destination is an absolute, machine-specific path.
df_cols_added.to_csv("/Users/lega/Documents/feature_eng_df_multi_cols_3.csv", index=False)

In [9]:
# Read the written file back as a check. The wordlist columns now show quoted items (e.g. ['truck']),
# i.e. the lists appear to come back as their string representation rather than as Python lists.
pd.read_csv('/Users/lega/Documents/feature_eng_df_multi_cols_3.csv')

Unnamed: 0.1,Unnamed: 0,text,name,year,departure,arrival,theme,latitude,longitude,date_slavery,...,transport_truck,age_0_to_8,age_9_to_17,age_18_to_20,age_21_to_23,age_24_to_26,age_27_to_29,age_30-38,age_39_47,age_48+
0,0,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,American Samoa,Forced labour,-14.354000,-170.773600,,...,[],[],[],[],[],[],[],[],[],[]
1,1,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Lebanon,Domestic slavery,33.900000,35.500000,,...,[],[],[],[],[],[],[],[],[],[]
2,2,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,India,Domestic slavery,24.071318,79.652396,2002-2004,...,[],[],['I was 12'],[],[],[],[],[],[],[]
3,3,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,United States,Domestic slavery,40.412044,-96.870925,1976-1981,...,[],[],['I was 15'],[],[],[],[],[],[],[]
4,4,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,"['Sudan ', 'United Kingdom ']",War slavery,16.865226,32.271106,1994-2000,...,[],[],['I was 12'],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1353,1353,"My name is Suleiman Ali, I’m Sudanese. I want ...",Suleiman,2020,Egypt,Libya,Trafficking,26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1354,1354,"In Nigeria, it was difficult to make a living....",Shola,2020,Nigeria,Libya,too much themes,26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1355,1355,"My name is Seif Eldein. I’m 24 years old, from...",Seif,2020,Chad,Libya,"['Trafficking', 'Forced labour']",26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1356,1356,I was begging and sleeping in the street behin...,Rita,2020,Nigeria,"['Italy ', 'Libya ']","['Sexual exploitation', 'Trafficking']",43.769562,11.255814,,...,['truck'],[],[],[],[],[],[],[],[],[]
