# **Data Cleaning & EDA Practice** 
by Daniel Lee

The main purpose is to clean the data, not to make statistical analysis.

In [1]:
import pandas as pd
import numpy as np
import math
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from autocorrect import Speller
import re

The data is stored in csvfile. 

## **Data Description**


In [2]:
csv_file_path = "../data/Food_choices/food_coded.csv"
data_ori = pd.read_csv(csv_file_path, low_memory=False)
data_cleaning = data_ori.copy()

In [3]:
# info function: including the index dtype and columns, non-null values and memory usage.
data_cleaning.info()
# data_cleaning.drop_duplicates() # check duplicate

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   GPA                           123 non-null    object 
 1   Gender                        125 non-null    int64  
 2   breakfast                     125 non-null    int64  
 3   calories_chicken              125 non-null    int64  
 4   calories_day                  106 non-null    float64
 5   calories_scone                124 non-null    float64
 6   coffee                        125 non-null    int64  
 7   comfort_food                  124 non-null    object 
 8   comfort_food_reasons          123 non-null    object 
 9   comfort_food_reasons_coded    106 non-null    float64
 10  cook                          122 non-null    float64
 11  comfort_food_reasons_coded.1  125 non-null    int64  
 12  cuisine                       108 non-null    float64
 13  diet_

In [4]:
# pandas.Dataframe.select_dtypes() function: Return a subset of the DataFrame’s columns based on the column dtypes.

obj_df = data_cleaning.select_dtypes(include=['object'])
num_df = data_cleaning.select_dtypes(exclude=['object'])

#helpfunction to seperate categorical and numerical features
def printColumnTypes(non_numeric_df, numeric_df):
    '''separates non-numeric and numeric columns'''
    print("Non-Numeric columns:")
    for col in non_numeric_df:
        print(f"{col}")
    print("")
    print("Numeric columns:")
    for col in numeric_df:
        print(f"{col}")

printColumnTypes(obj_df, num_df)

Non-Numeric columns:
GPA
comfort_food
comfort_food_reasons
diet_current
eating_changes
father_profession
fav_cuisine
food_childhood
healthy_meal
ideal_diet
meals_dinner_friend
mother_profession
type_sports
weight

Numeric columns:
Gender
breakfast
calories_chicken
calories_day
calories_scone
coffee
comfort_food_reasons_coded
cook
comfort_food_reasons_coded.1
cuisine
diet_current_coded
drink
eating_changes_coded
eating_changes_coded1
eating_out
employment
ethnic_food
exercise
father_education
fav_cuisine_coded
fav_food
fries
fruit_day
grade_level
greek_food
healthy_feeling
ideal_diet_coded
income
indian_food
italian_food
life_rewarding
marital_status
mother_education
nutritional_check
on_off_campus
parents_cook
pay_meal_out
persian_food
self_perception_weight
soup
sports
thai_food
tortilla_calories
turkey_calories
veggies_day
vitamins
waffle_calories


### There are a few problems for missing data method

* For example, by dropping rows/columns, you’re essentially losing information that might be useful for prediction

* On the other hand, imputing values will introduce bias to your data but it still might better than removing your features.

Here is a great analogy for this dilemma in this article by Elite Data Science.

Missing data is like missing a puzzle piece. If you drop it, that’s like pretending the puzzle slot isn’t there. If you impute it, that’s like trying to squeeze in a piece from somewhere else in the puzzle.

source:https://medium.com/bitgrit-data-science-publication/data-cleaning-with-python-f6bc3da64e45

In [5]:
missing_per_column = data_cleaning.isnull().sum()
# .sum() funciton return series(dataframe with one column; have different parameter from dataframe)

print(missing_per_column)

GPA                  2
Gender               0
breakfast            0
calories_chicken     0
calories_day        19
                    ..
type_sports         26
veggies_day          0
vitamins             0
waffle_calories      0
weight               2
Length: 61, dtype: int64


In [6]:
'''
shape function give give dimension of the dataframe which is [x,y]. length:x -> index = 0, width:y -> index = 1
Thus, we can use that info to find number of data and attributes.
'''
num_obs = np.product(data_ori.shape[0])
num_attr = np.product(data_ori.shape[1])
# This approach may not needed if we use info() instead but, who know? there might be an update in data

## Droping Feature

In [7]:
'''
Threshold for drop attributes would be 30% for big data and 20% for small data.
'''
# pandas.DataFrame.drop: Drop specified labels from rows or columns.

# pandas.DataFrame.iloc = for index, .loc = for column name
missing_percentage = (missing_per_column/num_obs)

#lt = less than, gt = greater than, le = less and equal, ge = greater and equal

feature_to_drop = missing_percentage[missing_percentage.ge(0.2)]
feature_to_keep = missing_percentage[missing_percentage.lt(0.2)]
#()parameter: subset of series, []parameter: index of subset

feature_to_drop_index = feature_to_drop.index
feature_to_keep_index = feature_to_keep.index

data_cleaning = data_cleaning[feature_to_keep_index]

This is much simpler way to drop column with certain threshold

In [8]:
# dropna(thresh): Require that many non-NA values.

data_cleaning2 = data_ori.copy()
thresh4data = len(data_cleaning2)*0.8
data_cleaning2 = data_cleaning2.dropna(axis =1, thresh=thresh4data)
# dropna() function returns a new DataFrame with missing values removed and does not modify the original DataFrame in place.
data_cleaning2 = data_cleaning2.dropna(axis =0, thresh= data_cleaning2.shape[1]*0.8)

When we use the dtypes function to distinguish between categorical and numerical data, we may find that some features that appear to be numerical are instead assigned to the categorical type.
We have observed that the features, GPA and Weight, have the potential to be represented as numerical data. However, the raw data in these features require cleaning to achieve this representation.

In [9]:
obj_df.head()

Unnamed: 0,GPA,comfort_food,comfort_food_reasons,diet_current,eating_changes,father_profession,fav_cuisine,food_childhood,healthy_meal,ideal_diet,meals_dinner_friend,mother_profession,type_sports,weight
0,2.4,none,we dont have comfort,eat good and exercise,eat faster,profesor,Arabic cuisine,rice and chicken,looks not oily,being healthy,"rice, chicken, soup",unemployed,car racing,187
1,3.654,"chocolate, chips, ice cream","Stress, bored, anger",I eat about three times a day with some snacks...,I eat out more than usual.,Self employed,Italian,"chicken and biscuits, beef soup, baked beans","Grains, Veggies, (more of grains and veggies),...",Try to eat 5-6 small meals a day. While trying...,"Pasta, steak, chicken",Nurse RN,Basketball,155
2,3.3,"frozen yogurt, pizza, fast food","stress, sadness","toast and fruit for breakfast, salad for lunch...",sometimes choosing to eat fast food instead of...,owns business,italian,"mac and cheese, pizza, tacos",usually includes natural ingredients; nonproce...,i would say my ideal diet is my current diet,"chicken and rice with veggies, pasta, some kin...",owns business,none,I'm not answering this.
3,3.2,"Pizza, Mac and cheese, ice cream",Boredom,"College diet, cheap and easy foods most nights...",Accepting cheap and premade/store bought foods,Mechanic,Turkish,"Beef stroganoff, tacos, pizza","Fresh fruits& vegetables, organic meats","Healthy, fresh veggies/fruits & organic foods",Grilled chicken \rStuffed Shells\rHomemade Chili,Special Education Teacher,,"Not sure, 240"
4,3.5,"Ice cream, chocolate, chips","Stress, boredom, cravings",I try to eat healthy but often struggle becaus...,I have eaten generally the same foods but I do...,IT,Italian,"Pasta, chicken tender, pizza","A lean protein such as grilled chicken, green ...",Ideally I would like to be able to eat healthi...,"Chicken Parmesan, Pulled Pork, Spaghetti and m...",Substance Abuse Conselor,Softball,190


To identify non-numerical data in a column, we can look for the unique values in the column.

In [10]:
print(obj_df['GPA'].unique())
print(obj_df['weight'].unique())

['2.4' '3.654' '3.3' '3.2' '3.5' '2.25' '3.8' '3.904' '3.4' '3.6' '3.1'
 nan '4' '2.2' '3.87' '3.7' '3.9' '2.8' '3' '3.65' '3.89' '2.9' '3.605'
 '3.83' '3.292' '3.35' 'Personal ' '2.6' '3.67' '3.73' '3.79 bitch' '2.71'
 '3.68' '3.75' '3.92' 'Unknown' '3.77' '3.63' '3.882']
['187' '155' "I'm not answering this. " 'Not sure, 240' '190' '180' '137'
 '125' '116' '110' '264' '123' '185' '145' '170' '135' '165' '175' '195'
 '105' '160' '167' '115' '205' nan '128' '150' '140' '120' '100' '113'
 '168' '169' '200' '265' '192' '118' '210' '112' '144 lbs' '130' '127'
 '129' '260' '184' '230' '138' '156']


In [11]:
# The drop() method will remove an entire row from a DataFrame when given a single index label, instead of removing a single value from a specific column.

# replace with correct format of data
data_cleaning['GPA'] = data_cleaning['GPA'].replace('3.79 bitch', 3.79)

# convert data into numeric, errors= 'coerce' change non-numeric to nan
data_cleaning['GPA'] = pd.to_numeric(data_cleaning['GPA'], errors ='coerce')
data_cleaning['weight'] = pd.to_numeric(data_cleaning['weight'],errors='coerce' )

obj_df = obj_df.drop(columns=['GPA', 'weight'],axis=1)
num_df = data_cleaning.select_dtypes(exclude=['object'])



object: texts, text values, or a mix of numeric and non-numeric values

For the column with object datatype can be change to categorical data



In [12]:
obj_columns = data_cleaning.select_dtypes(include="object").columns

# use lambda function tp convert it to lowercase and fix grammar 
data_cleaning[obj_columns] = data_cleaning[obj_columns].apply(lambda x: (x.str.lower()))

    
print(data_cleaning[obj_columns].head())
# print(data_cleaning['fav_cuisine'].unique())

                       comfort_food        comfort_food_reasons   
0                              none       we dont have comfort   \
1       chocolate, chips, ice cream        stress, bored, anger   
2   frozen yogurt, pizza, fast food             stress, sadness   
3  pizza, mac and cheese, ice cream                     boredom   
4      ice cream, chocolate, chips   stress, boredom, cravings    

                                        diet_current   
0                              eat good and exercise  \
1  i eat about three times a day with some snacks...   
2  toast and fruit for breakfast, salad for lunch...   
3  college diet, cheap and easy foods most nights...   
4  i try to eat healthy but often struggle becaus...   

                                      eating_changes father_profession   
0                                        eat faster          profesor   \
1                        i eat out more than usual.     self employed    
2  sometimes choosing to eat fast food

In the codebook, there are comments that some features are ideal for perform NLP(Nautral Language Processing). Thus, I used 3 features: comfort_food, comfort_food_reasons and diet_current

I used NLTK python library




## NLTK implementation (NLP)

In [13]:
## NOTE: Tools for NLTK package

tokenizer = TreebankWordTokenizer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [14]:
'''packages to download '''

# nltk.download("stopwords")
# nltk.download("averaged_perceptron_tagger")
# nltk.download("maxent_ne_chunker")
# nltk.download("words")

'packages to download '

https://www.ars.usda.gov/ARSUserFiles/80400530/pdf/1112/food_category_list.pdf


select the data that we are going to apply NLP

In [15]:

data_nlp = data_cleaning[['comfort_food', 'comfort_food_reasons', 'diet_current']]

data_nlp= data_nlp.astype(str)

data_nlp_prev = data_nlp ## copies of orginal data

### Chunk Function

In [16]:
'''This is main function that chunk the sentence into word with POS tag using parser. It will also create tree that shows structure of sentence'''
def chunk_NP(text, origin=True):    
    grammar =  r'''
    NP: {<FT><NN>|<RB>?<NN.*>+|<NN><CC><NN>|<DT>?<NN|NNS>+<POS>?}
    '''

    '''
    The rule states that whenever the chunk finds an optional determiner (DT) followed by any number of adjectives (JJ) and then a noun (NN) then the Noun Phrase(NP) chunk should be formed.
    '''
    
    if type(text) is not str:
        return ['none']
    else:    
        
        tokens = nltk.word_tokenize(text)
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        lotr_pos_tags = nltk.pos_tag(tokens)
        chunk_parser = nltk.RegexpParser(grammar)
        tree = chunk_parser.parse(lotr_pos_tags)

        noun_phrases = []
        for subtree in tree.subtrees():
            if subtree.label() =='NP':
                np_parts = []
                for leaf in subtree.leaves():
                    np_parts.append(leaf[0])
                noun_phrases.append(" ".join(np_parts))

        
        return noun_phrases


In [17]:
def get_Tree(text):
    grammar =  r'''
    NP: {<NN>|<RB>?<NN.*>+|<NN><CC><NN>|<DT>?<NN|NNS>+<POS>?}
    '''
    
    
    '''RB for extract 'and'
    '''

    tokens = nltk.word_tokenize(text)
    lotr_pos_tags = nltk.pos_tag(tokens)
    chunk_parser = nltk.RegexpParser(grammar)
    tree = chunk_parser.parse(lotr_pos_tags)

    return tree

In [18]:
comfort_food_col1 = []
comfort_food_col2 = []


for x in range(len(data_nlp["comfort_food"])):
    a = chunk_NP(data_nlp.loc[x,'comfort_food'])
    comfort_food_col1.append(a)

    b = chunk_NP(data_nlp.loc[x, 'comfort_food_reasons'])
    comfort_food_col2.append(b)

flat_list_1st = [item for sublist in comfort_food_col1 for item in sublist]

print(sorted(set(flat_list_1st))) #TODO check the output


['/', 'almond', 'any kind', 'banana sandwich', 'beef jerky', 'bread', 'bread/crackers', 'broccoli', 'brownie', 'burger', 'burrito', 'butter', 'butter naan', 'cake', 'candy', 'candy bar', 'candy pop chocolate chipotle moe', 'carrot', 'cereal', 'cheese', 'cheeseburger', 'cheesecake', 'cheez-its', 'chex-mix', 'chicken', 'chicken curry', 'chicken finger', 'chicken nugget', 'chicken nuggs', 'chicken wing', 'chilli', 'chip', 'chocolate', 'chocolate bar', 'chocolate brownie', 'chocolate ice cream', 'coffee', 'cookie dough', 'cooky', 'cornbread', 'cottage cheese', 'cup', 'deli sandwhich', 'dessets', 'dip', 'dish', 'donut', 'doritos', 'doughnut', 'egg', 'fast food', 'fire', 'food', 'fritos', 'fruit', 'fruit snack', 'fry', 'grandma', 'grandma homemade chocolate cake anything homemade', 'grape', 'hamburger', 'home', 'ice capps', 'ice crea', 'ice cream', 'ice cream/milkshake', 'ice-cream', 'icecream', 'kit kat', 'lasagna', 'lasagne', 'mac', 'macaroni', 'macaroon', 'mcdonalds', 'meatball sub', 'mil

### These are the results from the unique value from comfort_food:

['/', 'almond', 'any kind', 'banana sandwich', 'beef jerky', 'bread', 'bread/crackers', 'broccoli', 'brownie', 'burger', 'burrito', 'butter', 'butter naan', 'cake', 'candy', 'candy bar', 'candy pop chocolate chipotle moe', 'carrot', 'cereal', 'cheese', 'cheeseburger', 'cheesecake', 'cheez-its', 'chex-mix', 'chicken', 'chicken curry', 'chicken finger', 'chicken nugget', 'chicken nuggs', 'chicken wing', 'chilli', 'chip', 'chocolate', 'chocolate bar', 'chocolate brownie', 'chocolate ice cream', 'coffee', 'cookie dough', 'cooky', 'cornbread', 'cottage cheese', 'cup', 'deli sandwhich', 'dessets', 'dip', 'dish', 'donut', 'doritos', 'doughnut', 'egg', 'fast food', 'fire', 'food', 'fritos', 'fruit', 'fruit snack', 'fry', 'grandma', 'grandma homemade chocolate cake anything homemade', 'grape', 'hamburger', 'home', 'ice capps', 'ice crea', 'ice cream', 'ice cream/milkshake', 'ice-cream', 'icecream', 'kit kat', 'lasagna', 'lasagne', 'mac', 'macaroni', 'macaroon', 'mcdonalds', 'meatball sub', 'milkshake', 'mix', 'moes', 'mozzarella stick', 'nan', 'none', 'noodle', 'nugget', 'nutella', 'omelet', 'pasta', 'peanut butter', 'peanut butter sandwich', 'pepper', 'pepsi', 'pierogies', 'pizza', 'pizza chocolate chip', 'pizza cooky steak', 'plantain chip', 'pop', 'popcorn', 'pot pie', 'potato', 'potato chip', 'potato soup', 'pretzals', 'pretzel', 'protein bar', 'quinoa', 'ranch', 'reese', 'rice', 'ritz', 'salsa', 'salt', 'salty snack', 'slim jims', 'snack', 'soda', 'soup', 'spaghetti', 'sponge candy', 'squash', 'sub', 'sushi', 'sweet', 'terra chip', 'tikka masala', 'toast', 'tomato soup', 'truffle', 'tuna sandwich', 'twizzlers', 'vinegar chip', 'watermelon', 'wine', 'wing', 'yogurt']



#### I manually removed certain words from the list that were not appropriate for describing food.


In [19]:
list_keep_1st = ['almond', 'banana sandwich', 'beef jerky', 'bread',  'broccoli', 'brownie', 'burger', 'burrito', 'butter', 'butter naan', 'cake', 'candy', 'candy bar','carrot', 'cereal', 'cheese', 'cheeseburger', 'cheesecake', 'cheez-its', 'chex-mix', 'chicken', 'chicken curry', 'chicken finger', 'chicken nugget', 'chicken wing',  'chip', 'chocolate', 'chocolate bar', 'chocolate brownie', 'chocolate ice cream', 'coffee', 'cookie dough', 'cooky', 'cornbread', 'cottage cheese', 'deli sandwhich', 'doritos', 'doughnut', 'egg', 'fast food', 'fritos', 'fruit', 'fruit snack', 'fry', 'grape', 'hamburger', 'ice capps', 'ice cream', 'kit kat', 'lasagna', 'macaroni', 'macaroon', 'mcdonalds', 'meatball sub', 'milkshake', 'moes', 'mozzarella stick', 'none', 'noodle', 'nugget', 'nutella', 'omelet', 'pasta', 'peanut butter', 'peanut butter sandwich', 'pepper', 'pepsi', 'pierogies', 'pizza', 'pizza chocolate chip', 'pizza cooky steak', 'plantain chip', 'popcorn', 'pot pie', 'potato', 'potato chip', 'potato soup', 'pretzel', 'protein bar', 'quinoa', 'ranch', 'reese', 'rice', 'ritz', 'salsa', 'salty snack', 'slim jims', 'snack', 'soda', 'soup', 'spaghetti', 'sponge candy', 'squash', 'sub', 'sushi', 'terra chip', 'tikka masala', 'toast', 'tomato soup', 'truffle', 'tuna sandwich', 'twizzlers', 'vinegar chip', 'watermelon', 'wine', 'wing', 'yogurt']


list_remove_1st = list(set(flat_list_1st)-set(list_keep_1st))
print(list_remove_1st) ##TODO Check what words werer sorted out 


['home', 'donut', 'pop', 'chilli', 'lasagne', '/', 'pretzals', 'ice cream/milkshake', 'ice crea', 'mix', 'dessets', 'sweet', 'any kind', 'dish', 'food', 'fire', 'ice-cream', 'bread/crackers', 'cup', 'candy pop chocolate chipotle moe', 'nan', 'salt', 'icecream', 'dip', 'mac', 'chicken nuggs', 'grandma', 'grandma homemade chocolate cake anything homemade']


In [20]:
def find_sentences_with_word(sentences, target_words):
    matched_sentences = []
    count =0
    for target_word in target_words:
        pattern = r'\b{}\b'.format(re.escape(target_word))
        matched_sentence = [sentence for sentence in sentences if re.search(pattern, sentence, re.IGNORECASE)]
        matched_sentences.append(matched_sentence)
        count = count+1
        print(target_word, matched_sentence, ": \n")
    return matched_sentences

In [21]:
import difflib

def find_sentences_with_word(sentences, target_words, threshold=0.85):
    matched_sentences = []
    
    for target_word in target_words:
        matched_sentence = []
        
        for sentence in sentences:
            words = re.findall(r'\w+', sentence.lower())
            
            for word in words:
                similarity = difflib.SequenceMatcher(None, word, target_word).ratio()
                
                if similarity >= threshold:
                    matched_sentence.append(sentence)
                    break  # No need to check further words in the same sentence
                
        matched_sentences.append(matched_sentence)
        print(target_word, matched_sentence, ": \n")

    return matched_sentences

In [22]:
print(data_nlp.loc[39, 'comfort_food'])
print(data_nlp.loc[117, 'comfort_food'])


moe's le  
noodle ( any kinds of noodle), tuna sandwich, and egg.


In [23]:
old_value = "candy\rpop\rchocolate \rchipotle \rmoe's "
new_value = "candy pop chocolate chipotle moe's "

data_nlp['comfort_food'] = data_nlp['comfort_food'].replace(old_value, new_value)

print(data_nlp.loc[39, 'comfort_food'])

candy pop chocolate chipotle moe's 


In [24]:
avadacadabra = find_sentences_with_word(data_nlp['comfort_food'], list_remove_1st)


home ["grandma's chinese, peruvian food from back home, and sushi"] : 

donut ['donuts, ice cream, chips', 'cookies, donuts, candy bars', 'little debbie snacks, donuts, pizza'] : 

pop ["candy pop chocolate chipotle moe's ", 'ice cream, cake, pop, pizza, and milkshakes.'] : 

chilli ['chilli, soup, pot pie'] : 

lasagne ['chinese food, moes, sponge candy, homemade lasagne ', 'mac and cheese, lasagna, chinese food ', 'mac n cheese, lasagna, pizza'] : 

/ [] : 

pretzals ['chocolate, ice cream, french fries, pretzels', 'peanut butter sandwich, pretzals, garlic bread', 'peanut butter, dessets, pretzels. ', 'carrots and ranch, pretzels, dark chocolate ', 'pizza, pretzels, fruit snacks, deli sandwhich', 'chocolate bar, ice cream, pretzels, potato chips and protein bars.'] : 

ice cream/milkshake [] : 

ice crea ['chocolate, popcorn, icecream'] : 

mix ['chex-mix, wegmans cookies, cheez-its ', 'popcorn, chex mix, pizza'] : 

dessets ['peanut butter, dessets, pretzels. '] : 

sweet ['chocolat

Here's list of problem of unconsistant string format:

1. no comma between items
2. sepeated by slas or dash instead of comma
3. wrong spelling
4. too broad name (such as food, salt)
5. name with mark

chinese food, korean food 
mac and cheese vs mac n cheese


## 1st Modification after 1st output

Check list:

1. no comma between items
2. sepeated by slas or dash instead of comma -> done using regular expression
3. wrong spelling
4. too broad name (such as food, salt)
5. name with mark

In [25]:
def get_Tree(text):
    grammar =  r'''
    NP: {<custom_tags_foodtypes><NN>|<RB>?<NN.*>+|<NN><CC><NN>|<DT>?<NN|NNS>+<POS>?}
    '''

    custom_tags_foodtypes = ['chinese', 'korean']
    custom_tags_waste = ['grandma', 'homemade']

    tokens = nltk.word_tokenize(text)

    ## FT stand for food type
    lotr_pos_tags = [
        (word, custom_tags_foodtypes) if word in custom_tags_foodtypes
    else (word, tag) for word, tag in nltk.pos_tag(tokens)]
                 
    # lotr_pos_tags = nltk.pos_tag(tokens) ## previous version


    chunk_parser = nltk.RegexpParser(grammar)
    tree = chunk_parser.parse(lotr_pos_tags)

    return tree

In [26]:
'''This is an additional function I developed to sanitize informal language. Given my expertise, I couldn't come up with a more suitable alternative.'''

def informal2formal(text):

    text = remove_nonaplpha(text)
    formal_tokens = []
    informal_to_formal = {

        "n" : "and",  
        "crea" : "cream", # typo
        'egg.' : "egg", # typo
        'fire' : 'fry',
        'icecream' : 'ice cream'
        # "\r" : "," # input format : multiline cell
    }

    tokens = nltk.word_tokenize(text)

    for token in tokens:
        if informal_to_formal.get(token) is not None:
            formal_tokens.append(informal_to_formal.get(token))
        else:
            formal_tokens.append(token)
    
    ## convert plural to singular
    formal_tokens = [lemmatizer.lemmatize(word)for word in formal_tokens]

    return formal_tokens
 


In [27]:
import re
def remove_nonaplpha(text):
    # Remove any non-alphabetic characters except for hyphens and spaces
    text = re.sub(r'[^a-zA-Z\s\,\/-]', ' ', text)
    # Replace multiple spaces or hyphens with a single space
    text = re.sub(r'[\s]+', ' ', text)
    # Replace hyphens with a space
    text = text.replace('-', ' ')
    ## Replace foward slash with a comma
    text = text.replace('/', ', ')
    # Trim leading and trailing spaces
    text = text.strip()
    
    return text.lower()


In [28]:
''' This is main function that chunk the sentence into word with POS tag using parser. It will also create tree that shows structure of sentence '''
def chunk_NP(text):
    grammar =  r'''
    NP: { <RB>?<NN>*|<NN><CC><NN>|<DT>?<JJ>*<NN|NNS>+|}
    '''

    '''
    The rule states that whenever the chunk finds an optional determiner (DT) followed by any number of adjectives (JJ) and then a noun (NN) then the Noun Phrase(NP) chunk should be formed.
    '''


    if type(text) is not str:
        return ['none']
    else:    
        text = informal2formal(text)
            
        lotr_pos_tags = nltk.pos_tag(text)
        chunk_parser = nltk.RegexpParser(grammar)
        tree = chunk_parser.parse(lotr_pos_tags)

        noun_phrases = []
        for subtree in tree.subtrees():
            if subtree.label() =='NP':
                np_parts = []
                for leaf in subtree.leaves():
                    np_parts.append(leaf[0])
                noun_phrases.append(" ".join(np_parts))
        return noun_phrases


In [29]:

comfort_food_col1 = []
comfort_food_col2 = []


for x in range(len(data_nlp["comfort_food"])):
    a = chunk_NP(data_nlp.loc[x,'comfort_food'])
    comfort_food_col1.append(a)

    b = chunk_NP(data_nlp.loc[x, 'comfort_food_reasons'])
    comfort_food_col2.append(b)

flat_list_2nd = [item for sublist in comfort_food_col1 for item in sublist]

# print(set(flat_list_2nd)) #TODO check the output


list_remove_2nd = set(flat_list_2nd).difference(list_keep_1st)
print(list_remove_2nd) ##TODO Check what words werer sorted out 

{'little debbie snack', 'mac cheese', 'chex mix', 'slim', 'microwaveable food', 'peruvian food', 'dessets', 'sweet', 'french fry', 'korean food', 'nan', 'hot chocolate', 'wegmans', 'donut', 'frozen yogurt', 'sweet popcorn', 'lasagne', 'jims', 'pretzals', 'reese s cup dark chocolate', 'salt', 'noodle soup', 'pasta dish', 'pop', 'cracker', 'stuffed pepper', 'grandma s', 'cheese chip', 'chinese food', 'dip', 'wine mac', 'dark chocolate', 'grandma homemade chocolate cake anything homemade', 'back home', 'spaghetti squash', 'chilli', 'salsa ice cream', 'bagel ice capps', 'garlic bread', 'any kind', 'candy pop chocolate chipotle moe s', 'nuggs', 'french fire', 'mac'}


In [30]:
# print(set(flat_list_2nd).difference(set(flat_list_1st)))
print(set(flat_list_2nd)-set(flat_list_1st))

{'back home', 'spaghetti squash', 'frozen yogurt', 'sweet popcorn', 'little debbie snack', 'mac cheese', 'chex mix', 'salsa ice cream', 'slim', 'microwaveable food', 'jims', 'cracker', 'bagel ice capps', 'stuffed pepper', 'peruvian food', 'garlic bread', 'reese s cup dark chocolate', 'grandma s', 'candy pop chocolate chipotle moe s', 'nuggs', 'cheese chip', 'french fire', 'french fry', 'korean food', 'chinese food', 'noodle soup', 'wine mac', 'dark chocolate', 'hot chocolate', 'pasta dish', 'wegmans'}


In [31]:
for x in comfort_food_col1:
    print(str(x))

['none']
['chocolate', 'chip', 'ice cream']
['frozen yogurt', 'pizza', 'fast food']
['pizza', 'mac', 'cheese', 'ice cream']
['ice cream', 'chocolate', 'chip']
['candy', 'brownie', 'soda']
['chocolate', 'ice cream', 'french fry', 'pretzel']
['ice cream', 'cheeseburger', 'chip']
['donut', 'ice cream', 'chip']
['mac', 'cheese', 'chocolate', 'pasta']
['pasta', 'grandma homemade chocolate cake anything homemade']
['chocolate', 'pasta', 'soup', 'chip', 'popcorn']
['cooky', 'popcorn', 'chip']
['ice cream', 'cake', 'chocolate']
['pizza', 'fruit', 'spaghetti', 'chicken', 'potato']
['cooky', 'donut', 'candy bar']
['candy', 'kit kat']
['chip', 'cooky', 'ice cream']
['chocolate', 'ice cream']
['pizza', 'wing']
['fast food', 'pizza', 'sub']
['chocolate', 'sweet', 'ice cream']
['burger', 'chip', 'cooky']
['chilli', 'soup', 'pot pie']
['soup', 'pasta', 'brownie']
['chocolate', 'ice cream', 'milkshake', 'cooky']
['chip', 'ice cream', 'microwaveable food']
['chicken finger', 'pizza']
['cooky', 'hot cho

In [32]:

avadacadabra = find_sentences_with_word(data_nlp['comfort_food'], list_remove_1st)

count =0
for x in avadacadabra:
    count = count+1
    print(x, count)

home ["grandma's chinese, peruvian food from back home, and sushi"] : 

donut ['donuts, ice cream, chips', 'cookies, donuts, candy bars', 'little debbie snacks, donuts, pizza'] : 

pop ["candy pop chocolate chipotle moe's ", 'ice cream, cake, pop, pizza, and milkshakes.'] : 

chilli ['chilli, soup, pot pie'] : 

lasagne ['chinese food, moes, sponge candy, homemade lasagne ', 'mac and cheese, lasagna, chinese food ', 'mac n cheese, lasagna, pizza'] : 

/ [] : 

pretzals ['chocolate, ice cream, french fries, pretzels', 'peanut butter sandwich, pretzals, garlic bread', 'peanut butter, dessets, pretzels. ', 'carrots and ranch, pretzels, dark chocolate ', 'pizza, pretzels, fruit snacks, deli sandwhich', 'chocolate bar, ice cream, pretzels, potato chips and protein bars.'] : 

ice cream/milkshake [] : 

ice crea ['chocolate, popcorn, icecream'] : 

mix ['chex-mix, wegmans cookies, cheez-its ', 'popcorn, chex mix, pizza'] : 

dessets ['peanut butter, dessets, pretzels. '] : 

sweet ['chocolat

In [33]:
data_cleaning.loc[:,['comfort_food','comfort_food_reasons', 'diet_current']] = data_nlp


In [34]:
cf_explode = data_cleaning.explode('comfort_food') 
## The explode() function is used to transform each element of a list-like to a row, replicating the index values.

print(cf_explode['comfort_food'].unique())
print(cf_explode['comfort_food'].unique().shape)
print(cf_explode['comfort_food'].shape)

['none' 'chocolate, chips, ice cream' 'frozen yogurt, pizza, fast food'
 'pizza, mac and cheese, ice cream' 'ice cream, chocolate, chips '
 'candy, brownies and soda.'
 'chocolate, ice cream, french fries, pretzels'
 'ice cream, cheeseburgers, chips.' 'donuts, ice cream, chips'
 'mac and cheese, chocolate, and pasta '
 'pasta, grandma homemade chocolate cake anything homemade '
 'chocolate, pasta, soup, chips, popcorn' 'cookies, popcorn, and chips'
 'ice cream, cake, chocolate'
 'pizza, fruit, spaghetti, chicken and potatoes  '
 'cookies, donuts, candy bars' 'saltfish, candy and kit kat '
 'chips, cookies, ice cream' 'chocolate, ice crea '
 'pizza, wings, chinese' 'fast food, pizza, subs'
 'chocolate, sweets, ice cream' 'burgers, chips, cookies'
 'chilli, soup, pot pie' 'soup, pasta, brownies, cake'
 'chocolate, ice cream/milkshake, cookies'
 'chips, ice cream, microwaveable foods ' 'chicken fingers, pizza '
 'cookies, hot chocolate, beef jerky'
 'tomato soup, pizza, fritos, meatball s

grandma homemade chocolate cake anything homemade
ice crea

In [35]:
string = "Pizza / Wings / Cheesecake".lower()


string2 = "grandma homemade chocolate cake anything homemade".lower()

string3 = "Saltfish, Candy and Kit Kat "

string4 = "chocolate,ice cream/milkshake,cooky"
# print(chunk_NP(string2))
get_Tree(string2).draw()
# print(nltk.word_tokenize(string2))
print(informal2formal(string))

['pizza', ',', 'wing', ',', 'cheesecake']


In [36]:
from autocorrect import Speller


spell = Speller(lang='en')
text = "ice cream/milkshake"
corrected_text = spell(text)

print(corrected_text)

ice cream/milkshake


In [37]:
print(data_nlp.loc[39,'comfort_food'])


candy pop chocolate chipotle moe's 
