In [34]:
import nltk.corpus
import bs4
import numpy as np
import pandas as pd
import re
import json
import sys
import nltk
nltk.download('cmudict')


[nltk_data] Downloading package cmudict to /Users/lega/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [35]:


STOPWORDS = set(nltk.corpus.stopwords.words('english'))  # a list of stopwords
CMU_DICT = nltk.corpus.cmudict.dict()  # a dictionary of English


def add_wordlist_cols(df, text_col='text', terms_json='data/fe_dicts.json'):
    """
    Given a dataframe with a column containing text and a JSON of variables and associated term wordlists, return the
    dataframe with additional columns for each variable with values being the words found in each text.

    Args:
         df         -- the ambient dataframe
         text_col   -- the name of the column containing the text to analyse
         terms_json -- the path to a JSON file of variables and associated terms
    """
    with open(terms_json, 'r') as f:
        term_dict = json.load(f)
        return pd.concat([df, pd.DataFrame(list(df[text_col].apply(lambda x: _extract_terms(x, term_dict))), index=df.index)], axis=1)


def get_wordlists_for_text(text, terms_json='../constants/terms.json'):
    """
    Given a single text string and a JSON of variables and associated term wordlists, return a dictionary with each
    variable and the corresponding terms from each wordlist found in the text.

    Args:
         text       -- input string to analyse
         terms_json -- the path to a JSON file of variables and associated terms
    """
    with open(terms_json, 'r') as f:
        term_dict = json.load(f)
        return _extract_terms(text, term_dict)


def _extract_terms(text, term_dict):
    """
    Given a text and a dictionary of variables and associated term wordlists, return a dictionary with the words found
    in the text for each variable.

    Args:
         text      -- text to analyse
         term_dict -- a dictionary with keys being the variables and values being lists of associated words
    """
    return {t: [x for x in term_dict[t] if (len(re.findall("\\b{}\\b".format(str(x.lower())), text.lower())) > 0)] for t in term_dict}





In [49]:
sys.path.append('../')


from ht_project import nlp_utils

TERMS_JSON = '/Users/lega/code/QZKZ3/ht_project/ht_project/data/fe_dicts.json'
TEXT_FILENAME = '/Users/lega/code/QZKZ3/ht_project/raw_data/final_dataframe.csv'

#TO DO: should output ones and zeros instead of matches

df = pd.read_csv(TEXT_FILENAME, delimiter = ',' , encoding = 'utf-8')

df_nu = add_wordlist_cols(df[0:10], text_col='text', terms_json=TERMS_JSON)

#df.to_csv("raw_data/feature_eng_df.csv")




In [50]:
df

Unnamed: 0.1,Unnamed: 0,text,name,year,departure,arrival,theme,latitude,longitude,date_slavery,gender,accuracy
0,0,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,American Samoa,Forced labour,-14.354000,-170.773600,,female,67.0
1,1,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Lebanon,Domestic slavery,33.900000,35.500000,,female,98.0
2,2,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,India,Domestic slavery,24.071318,79.652396,2002-2004,female,97.0
3,3,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,United States,Domestic slavery,40.412044,-96.870925,1976-1981,female,100.0
4,4,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,"['Sudan ', 'United Kingdom ']",War slavery,16.865226,32.271106,1994-2000,male,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1353,1353,"My name is Suleiman Ali, I’m Sudanese. I want ...",Suleiman,2020,Egypt,Libya,Trafficking,26.335100,17.228331,,male,100.0
1354,1354,"In Nigeria, it was difficult to make a living....",Shola,2020,Nigeria,Libya,too much themes,26.335100,17.228331,,male,80.0
1355,1355,"My name is Seif Eldein. I’m 24 years old, from...",Seif,2020,Chad,Libya,"['Trafficking', 'Forced labour']",26.335100,17.228331,,male,100.0
1356,1356,I was begging and sleeping in the street behin...,Rita,2020,Nigeria,"['Italy ', 'Libya ']","['Sexual exploitation', 'Trafficking']",43.769562,11.255814,,female,100.0


In [51]:
df_nu

Unnamed: 0.1,Unnamed: 0,text,name,year,departure,arrival,theme,latitude,longitude,date_slavery,...,transport_truck,age_0_to_8,age_9_to_17,age_18_to_20,age_21_to_23,age_24_to_26,age_27_to_29,age_30-38,age_39_47,age_48+
0,0,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,American Samoa,Forced labour,-14.354,-170.7736,,...,[],[],[],[],[],[],[],[],[],[]
1,1,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Lebanon,Domestic slavery,33.9,35.5,,...,[],[],[],[],[],[],[],[],[],[]
2,2,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,India,Domestic slavery,24.071318,79.652396,2002-2004,...,[],[],[I was 12],[],[],[],[],[],[],[]
3,3,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,United States,Domestic slavery,40.412044,-96.870925,1976-1981,...,[],[],[I was 15],[],[],[],[],[],[],[]
4,4,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,"['Sudan ', 'United Kingdom ']",War slavery,16.865226,32.271106,1994-2000,...,[],[],[I was 12],[],[],[],[],[],[],[]
5,5,I was fortunate to work for a little period at...,Ashok,2004.0,Sudan,India,Forced labour,24.071318,79.652396,,...,[],[],[],[],[],[],[],[],[],[]
6,6,My name is Choti. I think I’m about 20 years o...,Choti,2004.0,Sudan,India,"['Forced labour', 'Debt bondage']",20.593683,78.962883,,...,"[truck, lorries]",[],[],[],[],[],[],[],[],[]
7,7,My name is Christina Elangwe. I was born in a ...,Christina,2005.0,Cameroon,United States,Domestic slavery,40.412044,-96.870925,,...,[],[],[],[],[],[],[],[],[],[]
8,8,I was born a slave. I was born into sexual sla...,Christine,1997.0,United States,United States,Sexual exploitation,40.412044,-96.870925,,...,[],[I was one],[],[],[],[],[],[],[],[]
9,9,Anyone of you sitting here at this very moment...,Jean-Robert,2002.0,Haiti,Haiti,Domestic slavery,19.093863,-72.219309,,...,[],[],[],[],[],[],[],[],[],[]


In [52]:
df_cols_added = add_wordlist_cols(df, text_col='text', terms_json=TERMS_JSON)

In [53]:
df_cols_added

Unnamed: 0.1,Unnamed: 0,text,name,year,departure,arrival,theme,latitude,longitude,date_slavery,...,transport_truck,age_0_to_8,age_9_to_17,age_18_to_20,age_21_to_23,age_24_to_26,age_27_to_29,age_30-38,age_39_47,age_48+
0,0,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,American Samoa,Forced labour,-14.354000,-170.773600,,...,[],[],[],[],[],[],[],[],[],[]
1,1,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Lebanon,Domestic slavery,33.900000,35.500000,,...,[],[],[],[],[],[],[],[],[],[]
2,2,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,India,Domestic slavery,24.071318,79.652396,2002-2004,...,[],[],[I was 12],[],[],[],[],[],[],[]
3,3,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,United States,Domestic slavery,40.412044,-96.870925,1976-1981,...,[],[],[I was 15],[],[],[],[],[],[],[]
4,4,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,"['Sudan ', 'United Kingdom ']",War slavery,16.865226,32.271106,1994-2000,...,[],[],[I was 12],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1353,1353,"My name is Suleiman Ali, I’m Sudanese. I want ...",Suleiman,2020,Egypt,Libya,Trafficking,26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1354,1354,"In Nigeria, it was difficult to make a living....",Shola,2020,Nigeria,Libya,too much themes,26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1355,1355,"My name is Seif Eldein. I’m 24 years old, from...",Seif,2020,Chad,Libya,"['Trafficking', 'Forced labour']",26.335100,17.228331,,...,[],[],[],[],[],[],[],[],[],[]
1356,1356,I was begging and sleeping in the street behin...,Rita,2020,Nigeria,"['Italy ', 'Libya ']","['Sexual exploitation', 'Trafficking']",43.769562,11.255814,,...,[truck],[],[],[],[],[],[],[],[],[]


In [54]:
df_cols_added = df.drop(columns=['Unnamed: 0'])

In [55]:
df_cols_added

Unnamed: 0,text,name,year,departure,arrival,theme,latitude,longitude,date_slavery,gender,accuracy
0,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,American Samoa,Forced labour,-14.354000,-170.773600,,female,67.0
1,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Lebanon,Domestic slavery,33.900000,35.500000,,female,98.0
2,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,India,Domestic slavery,24.071318,79.652396,2002-2004,female,97.0
3,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,United States,Domestic slavery,40.412044,-96.870925,1976-1981,female,100.0
4,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,"['Sudan ', 'United Kingdom ']",War slavery,16.865226,32.271106,1994-2000,male,90.0
...,...,...,...,...,...,...,...,...,...,...,...
1353,"My name is Suleiman Ali, I’m Sudanese. I want ...",Suleiman,2020,Egypt,Libya,Trafficking,26.335100,17.228331,,male,100.0
1354,"In Nigeria, it was difficult to make a living....",Shola,2020,Nigeria,Libya,too much themes,26.335100,17.228331,,male,80.0
1355,"My name is Seif Eldein. I’m 24 years old, from...",Seif,2020,Chad,Libya,"['Trafficking', 'Forced labour']",26.335100,17.228331,,male,100.0
1356,I was begging and sleeping in the street behin...,Rita,2020,Nigeria,"['Italy ', 'Libya ']","['Sexual exploitation', 'Trafficking']",43.769562,11.255814,,female,100.0


In [31]:
df_cols_added.to_csv("/Users/lega/Documents/feature_eng_df_multi_cols.csv", index=False)

In [32]:
pd.read_csv('/Users/lega/Documents/feature_eng_df_multi_cols.csv')

Unnamed: 0,text,name,year,departure,arrival,theme,latitude,longitude,date_slavery,gender,accuracy
0,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,American Samoa,Forced labour,-14.354000,-170.773600,,female,67.0
1,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Lebanon,Domestic slavery,33.900000,35.500000,,female,98.0
2,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,India,Domestic slavery,24.071318,79.652396,2002-2004,female,97.0
3,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,United States,Domestic slavery,40.412044,-96.870925,1976-1981,female,100.0
4,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,"['Sudan ', 'United Kingdom ']",War slavery,16.865226,32.271106,1994-2000,male,90.0
...,...,...,...,...,...,...,...,...,...,...,...
1353,"My name is Suleiman Ali,Â IâmÂ Sudanese. I w...",Suleiman,2020,Egypt,Libya,Trafficking,26.335100,17.228331,,male,100.0
1354,"In Nigeria, it was difficult to make a living....",Shola,2020,Nigeria,Libya,too much themes,26.335100,17.228331,,male,80.0
1355,"My name is SeifÂ Eldein. Iâm 24 years old, f...",Seif,2020,Chad,Libya,"['Trafficking', 'Forced labour']",26.335100,17.228331,,male,100.0
1356,I was begging and sleeping in the street behin...,Rita,2020,Nigeria,"['Italy ', 'Libya ']","['Sexual exploitation', 'Trafficking']",43.769562,11.255814,,female,100.0


In [33]:
pd.read_csv('/Users/lega/Downloads/fin_df_with_url.csv')

Unnamed: 0.1,Unnamed: 0,url,text,name,year,departure,theme,latitude,longitude,date_slavery,gender,accuracy,arrival_1,arrival_2,arrival_3
0,0,http://antislavery.ac.uk/items/show/7,My name is Vi. And I am 28 years old. I arrive...,Vi,2001.0,Vietnam,Forced labour,-14.354000,-170.773600,,female,67.0,American Samoa,,
1,1,http://antislavery.ac.uk/items/show/8,Thank you for the opportunity to testify befor...,Beatrice,2005.0,Sri Lank,Domestic slavery,33.900000,35.500000,,female,98.0,Lebanon,,
2,2,http://antislavery.ac.uk/items/show/102,My sister and I were living in a village. We w...,Kavita,2004.0,Sri Lank,Domestic slavery,24.071318,79.652396,2002-2004,female,97.0,India,,
3,3,http://antislavery.ac.uk/items/show/103,I grew up in a village of between 500 and 700 ...,Maria,2005.0,Mexico,Domestic slavery,40.412044,-96.870925,1976-1981,female,100.0,United States,,
4,4,http://antislavery.ac.uk/items/show/104,"I was living in a village, Karko, in the Nuba ...",Mende,2003.0,Sudan,War slavery,16.865226,32.271106,1994-2000,male,90.0,Sudan,United Kingdom,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,1354,http://antislavery.ac.uk/items/show/2950,"In Nigeria, it was difficult to make a living....",Shola,2020,Nigeria,too much themes,26.335100,17.228331,,male,80.0,Libya,,
1355,1355,http://antislavery.ac.uk/items/show/2951,"My name is Seif Eldein. I’m 24 years old, from...",Seif,2020,Chad,"['Trafficking', 'Forced labour']",26.335100,17.228331,,male,100.0,Libya,,
1356,1356,http://antislavery.ac.uk/items/show/2952,I was begging and sleeping in the street behin...,Rita,2020,Nigeria,"['Sexual exploitation', 'Trafficking']",43.769562,11.255814,,female,100.0,Italy,Libya,
1357,1357,http://antislavery.ac.uk/items/show/2953,I lived with my mother and father and they wer...,Nora,2020,Nigeria,too much themes,41.889499,12.528505,,female,100.0,Italy,Libya,Niger
