In [1]:
import pandas as pd
import datetime as dt
import re
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load the raw SMS export; backslash is the CSV escape character and
# 'created_at' is parsed as a datetime column.
farm = pd.read_csv(
    'data/farming.csv',
    escapechar='\\',
    parse_dates=['created_at'],
)
# Truncate each timestamp to its calendar date, stored again as datetime64.
created_day = farm['created_at'].dt.date
farm['created_at'] = pd.to_datetime(created_day)
print(farm.shape)

(1357648, 9)


In [3]:
# Remove messages that have no text at all.
# Index.get_values() was deprecated in pandas 0.25 and removed in 1.0, so the
# labels are read through `.values`, which works on every pandas version.
empty = farm.index[farm['content'].isnull()].values
# Rebind instead of dropping in place so the cell never mutates a frame that
# an earlier cell may still be displaying.
farm = farm.drop(empty)

In [4]:
# Keep only the rows tagged as English, then discard the language column.
eng = (
    farm
    .loc[farm['language_code'].eq('eng')]
    .drop(columns=['language_code'])
)
eng.shape

(852048, 8)

In [5]:
def content_size(x):
    """Return the number of whitespace-separated tokens in ``str(x)``."""
    tokens = str(x).split()
    return len(tokens)


def punct_remover(x):
    """Return ``str(x)`` without characters that are neither word characters nor whitespace."""
    text = str(x)
    return re.sub(r'[^\w\s]', '', text)


def make_lower(x):
    """Return ``str(x)`` converted to lowercase."""
    text = str(x)
    return text.lower()

In [6]:
def preproc_1(df):
    """Return a cleaned, re-indexed copy of ``df``; the input is not modified.

    The copy gains a ``content_size`` column holding the token count of the
    raw ``content``, has ``content`` stripped of punctuation and lowercased,
    and is indexed by ``(root_sms_in_id, sms_in_id)`` and passed through
    ``sort_index(level='root_sms_in_id')``.
    """
    cleaned = df.copy()
    raw_text = cleaned['content']

    # Token count is taken on the raw text, before punctuation is stripped.
    cleaned['content_size'] = raw_text.apply(content_size)
    # Strip punctuation first, then lowercase the result.
    cleaned['content'] = raw_text.apply(punct_remover).apply(make_lower)

    cleaned = cleaned.set_index(['root_sms_in_id', 'sms_in_id'])
    return cleaned.sort_index(axis=0, level='root_sms_in_id')

In [7]:
def preproc_2(df):
    """Drop every thread that contains a very short question.

    A row counts as a short question when ``cmd == 'question'`` and
    ``content_size <= 2``. Every row that shares the first index level
    (``root_sms_in_id`` in the frame produced by ``preproc_1``) with such a row
    is removed, whatever its own ``cmd`` or size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex whose first level identifies the thread, and
        with ``content_size`` and ``cmd`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the offending threads removed and the index levels
        moved back into ordinary columns. ``df`` itself is not modified.
    """
    is_short_question = (df['content_size'] <= 2) & (df['cmd'] == 'question')

    # Level 0 of the index is the thread id. Index.get_values() was removed in
    # pandas 1.0, and dropping labels one by one from a MultiIndex is slow on
    # large frames, so a boolean mask is built instead.
    root_ids = df.index.get_level_values(0)
    short_roots = root_ids[is_short_question.values]
    keep = ~root_ids.isin(short_roots)

    return df[keep].reset_index()

In [8]:
# Add the content_size column, normalise the message text and index the
# English messages by (root_sms_in_id, sms_in_id).
new_farm = preproc_1(eng)
new_farm.shape

(852048, 7)

In [9]:
# Remove every row that shares a root_sms_in_id with a question of two words
# or fewer, then move the index levels back into columns.
new_farm_2 = preproc_2(new_farm)
new_farm_2.shape

(826356, 9)

In [11]:
# Keep only the rows whose cmd is 'question'.
questions = new_farm_2.loc[new_farm_2['cmd'].eq('question')]

In [12]:
# Collapse questions with identical text, keeping the first occurrence.
questions = questions.drop_duplicates(subset=['content'], keep='first')
questions.shape

(309478, 9)

In [13]:
#bird, cock, poultry will still be left as they are
#Note: the rules below won't catch 'chick' or 'chic' at the end of a sentence (i.e. without a trailing space).
#'chick' and 'chic' are changed to 'chicken' only when they appear at the beginning or in the middle of a sentence, followed by
#a trailing space; otherwise every existing 'chicken' would become 'chickenen'

#spell checker
def pre_proc(df):
    """Return a copy of ``df`` with common misspellings in ``content`` normalised.

    The rules are regular-expression substitutions applied to the ``content``
    column one after another, in the order listed. Order matters: later rules
    clean up the output of earlier ones (e.g. ``chickenchicken``).

    The original implementation called ``df2.content.replace(..., inplace=True)``
    for every rule. That mutates a Series obtained through attribute access,
    which pandas reports as chained assignment and which has no effect on
    ``df2`` when Copy-on-Write is enabled. Here the substituted Series is
    assigned back to the column explicitly, so the result does not depend on
    the pandas version.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column of (already lowercased) text.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    rules = (
        # Strips every digit together with any adjacent 'x' prefix / underscores.
        (r'(x*_*[0-9]_*)', ""),
        (r'(a?kukus?)', "chicken"),
        # Surrounding \s prevents 'when' -> 'wchicken'.
        (r'(\s+(a?hens?)\s+)', " chicken "),
        (r'(chic?khens?)', "chicken"),
        # Trailing \s is required, otherwise 'chicken' would become 'chickenen'.
        (r'((a?chic?k?s?)\s+)', "chicken "),
        # Sentence-ending 'chicks'; the mandatory 's' avoids 'chickenen'.
        (r'(chic?k?s)', "chicken"),
        (r'(a?chikens?)', "chicken"),
        (r'(a?chikes?)', "chicken"),
        (r'(achicken)', "chicken"),
        # Inputs such as 'kukuchick' or 'chickenchick' end up as
        # 'chickenchicken' after the rules above; collapse them.
        (r'(chickenchicken)', "chicken"),
        (r'(poutry)', "poultry"),
        (r'(paul?try)', "poultry"),
        (r'(a?diseas?\s)', "disease "),
        (r'(a?diesease?s?)', "disease"),
        (r'(a?di?e?i?seace?s?)', "disease"),
        (r'(a?diss?easas?)', "disease"),
        (r'(a?dissease?s?)', "disease"),
        (r'(a?disease?d)', "disease"),
        (r'(a?dess?eas?e?s?)', "disease"),
        (r'(a?tomatos?e?s?)', "tomato"),
        (r'(p?f?ass?h?ion\s?frui?ts?)', "passionfruit"),
        (r'(water\smelons?)', "watermelon"),
        (r'(a?ferte?i?lisers?)', "fertilizer"),
        (r'(a?fertillizers?)', "fertilizer"),
        (r'(ferte?r?lizers?)', "fertilizer"),
        (r'(a?live\s?stocks?)', "livestock"),
        (r'(a?livestokes?)', "livestock"),
        (r'(medice?ne?)', "medicine"),
        (r'(medicin\s)', "medicine "),
        (r'(a?potatos?e?s?)', "potato"),
        (r'(swea?e?t\s?potatoe?s?)', "sweetpotato"),
        # 'irish' is dropped later as a stopword.
        (r'(irish?potatoe?s?)', "irish potato"),
        (r'(sugar\s?canes?)', "sugarcane"),
        (r'(army\s?wo?a?rms?)', "armyworm"),
        (r'(dewo?a?rme?d?r?s?i?n?g?)', "deworm"),
        (r'(egg plant)', "eggplant"),
        (r'(a?rabit)', "rabbit"),
        (r'(tobaco)', "tobacco"),
        # spaCy detects both 'mangoes' and 'mangos'.
        (r'(mangoe)', "mango"),
        (r'(agoat)', "goat"),
        (r'(acow)', "cow"),
        (r'(a?fri?ei?shian)', "friesian"),
        (r'(a?frei?sian)', "friesian"),
        (r'(calfs)', "calf"),
        # NOTE(review): [^ds] consumes the character after 'calve' (e.g. a
        # space) and never matches 'calves' -- confirm this is intended.
        (r'(a?calve[^ds])', "calf"),
        (r'(a?pigs?)', "pig"),
        # NOTE(review): this pattern needs a literal 'a' before 'piglet', so a
        # plain 'piglets' is left unchanged -- confirm this is intended.
        (r'(a?apiglets?)', "piglet"),
        (r'(a?turke?y)', "turkey"),
        (r'(a?sheep)', "sheep"),
        (r'(vacc?ina?t?e?d?i?o?n?g?)', "vaccine"),
        (r'(cabage)', "cabbage"),
        (r'(a?ban?nan?na)', "banana"),
        (r'(ovacc?ados?)', "avocado"),
        (r'(avo?a?ccados?)', "avocado"),
        (r'(macademia)', "macadamia"),
    )

    df2 = df.copy()
    content = df2['content']
    for pattern, replacement in rules:
        content = content.replace({pattern: replacement}, regex=True)
    # Explicit assignment back to the frame; no reliance on in-place mutation
    # of a column view.
    df2['content'] = content

    return df2

In [14]:
# Apply the regex-based spelling normalisation to the de-duplicated questions.
proc_questions = pre_proc(questions)

In [15]:
# Persist the spell-corrected questions. The previous version wrote
# `questions_v2`, a name that no cell in this notebook defines, and overwrote
# `proc_questions` with the None that to_csv returns when given a path.
proc_questions.to_csv('data/processed.csv')