In [1]:
import string

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.corpus import wordnet

In [2]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; text_process calls lemmatizer.lemmatize on every token.
lemmatizer = WordNetLemmatizer()

In [3]:
# Vocabulary sheet: rows with no 'Original' entry are discarded and every
# remaining missing cell becomes an empty string.
voc = pd.read_excel('vocabulary.xlsx', sheet_name='Vocabulary Clean')
voc = voc.dropna(subset=['Original']).replace(np.nan, '')
# Map each stripped original term to its stripped replacement. The two-name
# unpacking requires the sheet to hold exactly two columns.
repl_words = {
    src.strip(): dst.strip()
    for src, dst in voc.itertuples(index=False, name=None)
}

In [4]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``R`` -> adverb. ``N`` and every other tag yield noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag both fall back to noun.
    return wordnet.NOUN

# Load the 'en_core_web_lg' spaCy pipeline once at module level; text_process calls it as `nlp`.
nlp = spacy.load('en_core_web_lg')

In [5]:
def text_process(text):
    """
    Reduce a free-text summary to a space-separated string of vocabulary terms.

    Steps:
    1. Coerce the input to ``str`` and delete every ``string.punctuation``
       character.
    2. Pass the text through the spaCy pipeline ``nlp`` (best effort).
    3. Tokenise with NLTK, drop English stopwords, and lemmatise each token
       using the POS returned by ``get_wordnet_pos``; lemmas are lower-cased.
    4. Keep only lemmas that are keys of ``repl_words`` and emit the mapped
       replacement, or the lemma itself when the replacement is ``''``.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so ``None`` and
        NaN become the literal strings ``'None'`` and ``'nan'``.

    Returns
    -------
    str
        The surviving terms joined by single spaces; ``''`` if none survive.
    """
    text = str(text)

    # Remove all punctuation characters in a single pass.
    nopunc = text.translate(str.maketrans('', '', string.punctuation))

    # Best effort: if the spaCy pipeline rejects the text, carry on with the
    # punctuation-free string instead of aborting the whole batch.
    # NOTE(review): str() of a spaCy Doc is its text, so this round-trip looks
    # like a no-op -- confirm before removing it.
    try:
        nopunc = str(nlp(nopunc))
    except Exception:
        pass

    # Build the stopword set once per call rather than re-reading the corpus
    # list for every token.
    stop_words = set(stopwords.words('english'))

    lemmas = [
        lemmatizer.lemmatize(token, get_wordnet_pos(token)).lower()
        for token in nltk.word_tokenize(nopunc)
        if token.lower() not in stop_words
    ]

    # Restrict to the controlled vocabulary; an empty replacement means
    # "keep the lemma as is".
    cleaned = [
        repl_words[lemma] if repl_words[lemma] != '' else lemma
        for lemma in lemmas
        if lemma in repl_words
    ]
    return ' '.join(cleaned)

In [6]:
# Load the raw case dataset from the local Excel workbook.
original = pd.read_excel('Sherloc_dataset.xlsx')

In [7]:
# Preview the first rows of the raw frame.
original.head()

Unnamed: 0,ID,Page_Title,URL,Fact_Summary,Language,Cross_Cutting,Country,Corruption,Counterfeiting,Criminal_group,...,Offending_United Arab Emirates,Offending_United Kingdom of Great Britain and Northern Ireland,Offending_United Republic of Tanzania,Offending_United States of America,Offending_Uruguay,Offending_Uzbekistan,Offending_Venezuela (Bolivarian Republic of),Offending_Viet Nam,Offending_Yemen,Offending_Zambia
0,ALB001,Decision No. 648,https://sherloc.unodc.org/cld/case-law-doc/tra...,In the months of July and August of 2005 the d...,en,[],Albania,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ALB001,Decision No. 648,https://sherloc.unodc.org/cld/case-law-doc/tra...,In the months of July and August of 2005 the d...,en,[],Albania,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ALB002,Decision No.292,https://sherloc.unodc.org/cld/case-law-doc/tra...,Defendant Genc Hysa and S K moved in together ...,en,[],,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ALB002,Decision No.292,https://sherloc.unodc.org/cld/case-law-doc/tra...,Defendant Genc Hysa and S K moved in together ...,en,[],,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ALB002,Decision No.292,https://sherloc.unodc.org/cld/case-law-doc/tra...,Defendant Genc Hysa and S K moved in together ...,en,[],,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# One row per case ID, with a fresh 0..n-1 index, restricted to five columns.
df = (
    original
    .drop_duplicates(subset='ID', ignore_index=True)
    [['ID', 'Fact_Summary', 'Language', 'Trafficking_Persons', 'Migrant_Smuggling']]
)

In [9]:
# Rows and columns left after de-duplicating on ID and selecting columns.
df.shape

(3170, 5)

In [10]:
# Keep only rows whose Language is exactly 'en'; rows with a missing
# Language compare unequal to 'en' and are dropped as well.
df = df[df["Language"] == 'en'].copy()

In [11]:
# Keep cases where at least one of the two offence flags equals 1.
df = df[df[['Trafficking_Persons', 'Migrant_Smuggling']].eq(1).any(axis=1)].copy()

In [12]:
# Preview the filtered frame; index labels are no longer contiguous after filtering.
df.head()

Unnamed: 0,ID,Fact_Summary,Language,Trafficking_Persons,Migrant_Smuggling
0,ALB001,In the months of July and August of 2005 the d...,en,1,0
1,ALB002,Defendant Genc Hysa and S K moved in together ...,en,1,0
2,ALB003,The defendant met the victim in 2001 and they ...,en,1,0
4,ALB005,The defendant lured the victim to the Albanian...,en,1,0
5,ALB006,The defendant approached the victim s family a...,en,1,0


In [13]:
# pop() returns the Fact_Summary column and removes it from df in place,
# so re-running this cell without rebuilding df raises KeyError.
summaries = df.pop('Fact_Summary')

In [24]:
# Look up the summary by index label 4 (a label, not a position).
summaries.loc[4]

'The defendant lured the victim to the Albanian city of Korca with the promise of marriage From there the defendant brought the victim to Greece using forged passports with Greek visas The victim was forced into prostitution and was captured by Greek police and deported back to Albania on 29 May 2004'

In [26]:
# Spot-check the cleaner on the summary stored under index label 4.
text_process(summaries.loc[4])

'defendant lure victim albanian city promise marriage defendant brought victim greece use forge passport greek visa victim force prostitution capture greek police deport albania'

In [16]:
# Clean every summary; each element of the result is the space-joined string text_process returns.
sums = summaries.apply(text_process)

In [18]:
# Number of cleaned summaries.
len(sums)

2305

In [56]:
# Persist the cleaned summaries as the repr of a Python list (same on-disk
# format as before). The encoding is pinned to UTF-8 so the write does not
# depend on the platform default, which can raise UnicodeEncodeError on
# non-ASCII vocabulary terms.
with open('sums.txt', 'w', encoding='utf-8') as f:
    f.write(str(sums.to_list()))