# **Data Cleaning & EDA Practice**
### by Daniel Lee

The main purpose is to clean the data, not to make statistical analysis.

In [81]:
import pandas as pd
import numpy as np
import math
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from autocorrect import Speller

* our data is stored in csvfile. 

## **Data Description**


In [82]:
data_ori = pd.read_csv("../data/Food_choices/food_coded.csv", low_memory=False)
data_cleaning = data_ori.copy()

In [83]:
# info function: including the index dtype and columns, non-null values and memory usage.
data_cleaning.info()
# data_cleaning.drop_duplicates() # check duplicate

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   GPA                           123 non-null    object 
 1   Gender                        125 non-null    int64  
 2   breakfast                     125 non-null    int64  
 3   calories_chicken              125 non-null    int64  
 4   calories_day                  106 non-null    float64
 5   calories_scone                124 non-null    float64
 6   coffee                        125 non-null    int64  
 7   comfort_food                  124 non-null    object 
 8   comfort_food_reasons          123 non-null    object 
 9   comfort_food_reasons_coded    106 non-null    float64
 10  cook                          122 non-null    float64
 11  comfort_food_reasons_coded.1  125 non-null    int64  
 12  cuisine                       108 non-null    float64
 13  diet_

In [84]:
# pandas.Dataframe.select_dtypes() function: Return a subset of the DataFrame’s columns based on the column dtypes.

obj_df = data_cleaning.select_dtypes(include=['object'])
num_df = data_cleaning.select_dtypes(exclude=['object'])

#helpfunction to seperate categorical and numerical features
def printColumnTypes(non_numeric_df, numeric_df):
    '''separates non-numeric and numeric columns'''
    print("Non-Numeric columns:")
    for col in non_numeric_df:
        print(f"{col}")
    print("")
    print("Numeric columns:")
    for col in numeric_df:
        print(f"{col}")

printColumnTypes(obj_df, num_df)

Non-Numeric columns:
GPA
comfort_food
comfort_food_reasons
diet_current
eating_changes
father_profession
fav_cuisine
food_childhood
healthy_meal
ideal_diet
meals_dinner_friend
mother_profession
type_sports
weight

Numeric columns:
Gender
breakfast
calories_chicken
calories_day
calories_scone
coffee
comfort_food_reasons_coded
cook
comfort_food_reasons_coded.1
cuisine
diet_current_coded
drink
eating_changes_coded
eating_changes_coded1
eating_out
employment
ethnic_food
exercise
father_education
fav_cuisine_coded
fav_food
fries
fruit_day
grade_level
greek_food
healthy_feeling
ideal_diet_coded
income
indian_food
italian_food
life_rewarding
marital_status
mother_education
nutritional_check
on_off_campus
parents_cook
pay_meal_out
persian_food
self_perception_weight
soup
sports
thai_food
tortilla_calories
turkey_calories
veggies_day
vitamins
waffle_calories


### There are a few problems for missing data method

* For example, by dropping rows/columns, you’re essentially losing information that might be useful for prediction

* On the other hand, imputing values will introduce bias to your data but it still might better than removing your features.

Here is a great analogy for this dilemma in this article by Elite Data Science.

Missing data is like missing a puzzle piece. If you drop it, that’s like pretending the puzzle slot isn’t there. If you impute it, that’s like trying to squeeze in a piece from somewhere else in the puzzle.

source:https://medium.com/bitgrit-data-science-publication/data-cleaning-with-python-f6bc3da64e45

In [85]:
missing_per_column = data_cleaning.isnull().sum()
# .sum() funciton return series(dataframe with one column; have different parameter from dataframe)

print(missing_per_column)

GPA                  2
Gender               0
breakfast            0
calories_chicken     0
calories_day        19
                    ..
type_sports         26
veggies_day          0
vitamins             0
waffle_calories      0
weight               2
Length: 61, dtype: int64


In [86]:
'''
shape function give give dimension of the dataframe which is [x,y]. length:x -> index = 0, width:y -> index = 1
Thus, we can use that info to find number of data and attributes.
'''
num_obs = np.product(data_ori.shape[0])
num_attr = np.product(data_ori.shape[1])
# This approach may not needed if we use info() instead but, who know? there might be an update in data

### Droping Feature

In [87]:
'''
Threshold for drop attributes would be 30% for big data and 20% for small data.
'''
# pandas.DataFrame.drop: Drop specified labels from rows or columns.

# pandas.DataFrame.iloc = for index, .loc = for column name
missing_percentage = (missing_per_column/num_obs)

#lt = less than, gt = greater than, le = less and equal, ge = greater and equal

feature_to_drop = missing_percentage[missing_percentage.ge(0.2)]
feature_to_keep = missing_percentage[missing_percentage.lt(0.2)]
#()parameter: subset of series, []parameter: index of subset

feature_to_drop_index = feature_to_drop.index
feature_to_keep_index = feature_to_keep.index

data_cleaning = data_cleaning[feature_to_keep_index]

This is much simpler way to drop column with certain threshold

In [88]:
# dropna(thresh): Require that many non-NA values.

data_cleaning2 = data_ori.copy()
thresh4data = len(data_cleaning2)*0.8
data_cleaning2 = data_cleaning2.dropna(axis =1, thresh=thresh4data)
# dropna() function returns a new DataFrame with missing values removed and does not modify the original DataFrame in place.
data_cleaning2 = data_cleaning2.dropna(axis =0, thresh= data_cleaning2.shape[1]*0.8)

When we use the dtypes function to distinguish between categorical and numerical data, we may find that some features that appear to be numerical are instead assigned to the categorical type.
We have observed that the features, GPA and Weight, have the potential to be represented as numerical data. However, the raw data in these features require cleaning to achieve this representation.

In [89]:
obj_df.head()

Unnamed: 0,GPA,comfort_food,comfort_food_reasons,diet_current,eating_changes,father_profession,fav_cuisine,food_childhood,healthy_meal,ideal_diet,meals_dinner_friend,mother_profession,type_sports,weight
0,2.4,none,we dont have comfort,eat good and exercise,eat faster,profesor,Arabic cuisine,rice and chicken,looks not oily,being healthy,"rice, chicken, soup",unemployed,car racing,187
1,3.654,"chocolate, chips, ice cream","Stress, bored, anger",I eat about three times a day with some snacks...,I eat out more than usual.,Self employed,Italian,"chicken and biscuits, beef soup, baked beans","Grains, Veggies, (more of grains and veggies),...",Try to eat 5-6 small meals a day. While trying...,"Pasta, steak, chicken",Nurse RN,Basketball,155
2,3.3,"frozen yogurt, pizza, fast food","stress, sadness","toast and fruit for breakfast, salad for lunch...",sometimes choosing to eat fast food instead of...,owns business,italian,"mac and cheese, pizza, tacos",usually includes natural ingredients; nonproce...,i would say my ideal diet is my current diet,"chicken and rice with veggies, pasta, some kin...",owns business,none,I'm not answering this.
3,3.2,"Pizza, Mac and cheese, ice cream",Boredom,"College diet, cheap and easy foods most nights...",Accepting cheap and premade/store bought foods,Mechanic,Turkish,"Beef stroganoff, tacos, pizza","Fresh fruits& vegetables, organic meats","Healthy, fresh veggies/fruits & organic foods",Grilled chicken \rStuffed Shells\rHomemade Chili,Special Education Teacher,,"Not sure, 240"
4,3.5,"Ice cream, chocolate, chips","Stress, boredom, cravings",I try to eat healthy but often struggle becaus...,I have eaten generally the same foods but I do...,IT,Italian,"Pasta, chicken tender, pizza","A lean protein such as grilled chicken, green ...",Ideally I would like to be able to eat healthi...,"Chicken Parmesan, Pulled Pork, Spaghetti and m...",Substance Abuse Conselor,Softball,190


To identify non-numerical data in a column, we can look for the unique values in the column.

In [90]:
print(obj_df['GPA'].unique())
print(obj_df['weight'].unique())

['2.4' '3.654' '3.3' '3.2' '3.5' '2.25' '3.8' '3.904' '3.4' '3.6' '3.1'
 nan '4' '2.2' '3.87' '3.7' '3.9' '2.8' '3' '3.65' '3.89' '2.9' '3.605'
 '3.83' '3.292' '3.35' 'Personal ' '2.6' '3.67' '3.73' '3.79 bitch' '2.71'
 '3.68' '3.75' '3.92' 'Unknown' '3.77' '3.63' '3.882']
['187' '155' "I'm not answering this. " 'Not sure, 240' '190' '180' '137'
 '125' '116' '110' '264' '123' '185' '145' '170' '135' '165' '175' '195'
 '105' '160' '167' '115' '205' nan '128' '150' '140' '120' '100' '113'
 '168' '169' '200' '265' '192' '118' '210' '112' '144 lbs' '130' '127'
 '129' '260' '184' '230' '138' '156']


In [91]:
# The drop() method will remove an entire row from a DataFrame when given a single index label, instead of removing a single value from a specific column.

# replace with correct format of data
data_cleaning['GPA'] = data_cleaning['GPA'].replace('3.79 bitch', 3.79)

# convert data into numeric, errors= 'coerce' change non-numeric to nan
data_cleaning['GPA'] = pd.to_numeric(data_cleaning['GPA'], errors ='coerce')
data_cleaning['weight'] = pd.to_numeric(data_cleaning['weight'],errors='coerce' )

obj_df = obj_df.drop(columns=['GPA', 'weight'],axis=1)
num_df = data_cleaning.select_dtypes(exclude=['object'])



object: texts, text values, or a mix of numeric and non-numeric values

For the column with object datatype can be change to categorical data



In [92]:
obj_columns = data_cleaning.select_dtypes(include="object").columns

# use lambda function tp convert it to lowercase and fix grammar 
data_cleaning[obj_columns] = data_cleaning[obj_columns].apply(lambda x: (x.str.lower()))

    
print(data_cleaning[obj_columns].head())
# print(data_cleaning['fav_cuisine'].unique())

                       comfort_food        comfort_food_reasons   
0                              none       we dont have comfort   \
1       chocolate, chips, ice cream        stress, bored, anger   
2   frozen yogurt, pizza, fast food             stress, sadness   
3  pizza, mac and cheese, ice cream                     boredom   
4      ice cream, chocolate, chips   stress, boredom, cravings    

                                        diet_current   
0                              eat good and exercise  \
1  i eat about three times a day with some snacks...   
2  toast and fruit for breakfast, salad for lunch...   
3  college diet, cheap and easy foods most nights...   
4  i try to eat healthy but often struggle becaus...   

                                      eating_changes father_profession   
0                                        eat faster          profesor   \
1                        i eat out more than usual.     self employed    
2  sometimes choosing to eat fast food

In the codebook, there are comments that some features are ideal for perform NLP(Nautral Language Processing). Thus, I used 3 features: comfort_food, comfort_food_reasons and diet_current

I used NLTK python library




In [93]:
tokenizer = TreebankWordTokenizer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [94]:
'''packages to download '''

# nltk.download("stopwords")
# nltk.download("averaged_perceptron_tagger")
# nltk.download("maxent_ne_chunker")
# nltk.download("words")

'packages to download '

https://www.ars.usda.gov/ARSUserFiles/80400530/pdf/1112/food_category_list.pdf


extract the data that we are going to use NLP

In [95]:
data_nlp = data_cleaning[['comfort_food', 'comfort_food_reasons', 'diet_current']]

data_nlp_prev = data_nlp ## copies of orginal data

## chunk function

In [96]:
'''This is main function that chunk the sentence into word with POS tag using parser. It will also create tree that shows structure of sentence'''
def chunk_NP(text, origin=True):    
    grammar =  r'''
    NP: {<RB>?<NN.*>+|<VBZ><NN>|<NN><CC><NN>|<DT>?<NN|NNS>+}
    '''

    '''
    The rule states that whenever the chunk finds an optional determiner (DT) followed by any number of adjectives (JJ) and then a noun (NN) then the Noun Phrase(NP) chunk should be formed.
    '''
    
    if type(text) is not str:
        return ['none']
    else:    
        
        tokens = nltk.word_tokenize(text)
        lotr_pos_tags = nltk.pos_tag(tokens)
        chunk_parser = nltk.RegexpParser(grammar)
        tree = chunk_parser.parse(lotr_pos_tags)

        noun_phrases = []
        for subtree in tree.subtrees():
            if subtree.label() =='NP':
                np_parts = []
                for leaf in subtree.leaves():
                    np_parts.append(leaf[0])
                noun_phrases.append(" ".join(np_parts))
        return noun_phrases


In [97]:
def get_Tree(text):
    grammar =  r'''
    NP: {<RB>?<NN.*>+|<VBZ><NN|NNS>|<NN|NNS><CC><NN|NNS>|<DT>?<NN|NNS>+}
    '''
    
    
    '''RB to extract 'and'
    '''

    tokens = nltk.word_tokenize(text)
    lotr_pos_tags = nltk.pos_tag(tokens)
    chunk_parser = nltk.RegexpParser(grammar)
    tree = chunk_parser.parse(lotr_pos_tags)

    return tree

In [98]:
new_column = []
new_column2 = []

for x in range(len(data_nlp["comfort_food"])):

    a = chunk_NP(data_nlp.loc[x,'comfort_food'])
    new_column.append(a)

    b = chunk_NP(data_nlp.loc[x, 'comfort_food_reasons'])
    new_column2.append(b)

flat_list = [item for sublist in new_column for item in sublist]



print(set(flat_list))

{'pasta dishes', 'squash', 'food', 'pretzals', 'cups', 'candy', 'chicken wings', 'grandma homemade chocolate cake anything homemade', 'salty snacks', 'rice', 'toast', 'fritos', 'peanut butter', 'nuggets', 'noodle', 'ice-cream', 'any kinds', 'soup', 'chocolate brownie', 'chicken', 'moes', 'chips', 'doughnuts', 'cornbread', 'kit kat', 'popcorn', 'twizzlers', 'fast food', 'wings', 'cookie dough', 'ice crea', 'pot pie', 'chips sweets', 'pretzels', 'almonds', 'coffee', 'chicken curry', 'chocolate', 'grapes', 'reese', 'tikka masala', 'egg', 'chicken nuggs', 'home', 'salt', 'candy pop chocolate chipotle moe', 'milkshakes', 'fruit snacks', 'mix', 'pasta', 'grandma', 'watermelon', 'spaghetti', 'protein bars', 'cake', 'ranch', 'burgers', 'dr. pepper', 'cheesecake', 'omelet', 'none', 'beef jerky', 'mozzarella sticks', 'foods', 'pierogies', 'dip', 'macaroons', 'doritos', 'potatoes', 'icecream', 'deli sandwhich', 'snacks', 'cheese', 'potato', 'tuna sandwich', 'peppers', 'butter', 'cheez-its', 'chee

### These are the results from the unique value from comfort_food:

{'doughnuts', 'pot pie', 'salt', 'carrots', 'mix', 'pop', 'fruit snacks', 'fast food', 'coffee', 'grandma', 'omelet', 'butter naan', 'chicken', 'icecream', 'ranch', 'meatball sub', 'soda', 'cereal', 'beef jerky', 'potato', 'brownies', 'tomato soup', 'candy pop chocolate chipotle moe', 'pizza chocolate chips bagels ice capps', 'potato chips', 'ice-cream', 'sushi', 'burritos', 'doritos', 'reese', 'squash', 'bread', 'pizza', 'dr. pepper', 'twizzlers', 'burgers', 'yogurt', 'ice crea', 'broccoli', 'any kinds', 'peanut butter', 'chocolates', 'mozzarella sticks', 'sponge candy', 'cheesecake', 'cheez-its', 'dessets', 'chilli', 'ice cream/milkshake', 'grandma homemade chocolate cake anything homemade', 'snacks', 'fruit', 'moes', 'mcdonalds', 'vinegar chips', 'wine', 'salty snacks', 'fritos', 'deli sandwhich', 'fries', 'pepsi', 'cheese', 'potato soup', 'donuts', 'chicken curry', 'wings', 'burger', 'cookies', 'truffles', 'chocolate brownie', 'banana sandwich', 'chicken fingers', 'chips sweets', 'pierogies', 'nuggets', 'potatoes', 'cookie dough', 'subs', 'cake', 'pasta dishes', 'slim jims', 'protein bars', 'sweets', 'soup', 'chex-mix', 'peppers', 'bread/crackers', 'dip', 'quinoa', 'lasagna', 'chips', 'tikka masala', 'pizza / wings / cheesecake', 'toast', 'pretzals', 'egg', 'popcorn', 'peanut butter sandwich', 'salsa', 'pasta', 'chocolate', 'candy', 'cheeseburgers', 'tuna sandwich', 'grapes', 'home', 'foods', 'macaroni', 'cornbread', 'chicken wings', 'chocolate ice cream', 'chocolate bar', 'watermelon', 'pancakes', 'lasagne', 'almonds', 'cottage cheese', 'milkshakes', 'chicken nuggs', 'pretzels', 'ritz', 'macaroons', 'noodle', 'nutella', 'cups', 'fires', 'rice', 'none', 'mac', 'debbie snacks', 'ice cream', 'candy bars', 'spaghetti', 'food', 'pizza cookies', 'butter', 'kit kat', 'hamburgers'}


#### I manually removed certain words from the list that were not appropriate for describing food.


In [99]:
list_keep_1st = {'doughnuts', 'pot pie', 'carrots', 'mix', 'fruit snacks', 'fast food', 'coffee', 'grandma', 'omelet', 'butter naan', 'chicken', 'meatball sub', 'soda', 'cereal', 'beef jerky', 'potato', 'brownies', 'tomato soup', 'potato chips', 'sushi', 'burritos', 'doritos', 'reese', 'squash', 'bread', 'pizza', 'dr. pepper', 'twizzlers', 'burgers', 'yogurt', 'broccoli', 'peanut butter', 'chocolates', 'mozzarella sticks', 'sponge candy', 'cheesecake', 'cheez-its', 'dessets', 'chilli', 'snacks', 'fruit', 'moes', 'mcdonalds', 'vinegar chips', 'wine', 'salty snacks', 'fritos', 'deli sandwhich', 'fries', 'pepsi', 'cheese', 'potato soup', 'donuts', 'chicken curry', 'wings', 'burger', 'cookies', 'truffles', 'chocolate brownie', 'banana sandwich', 'chicken fingers', 'chips sweets', 'pierogies', 'nuggets', 'potatoes', 'cookie dough', 'subs', 'cake', 'pasta dishes', 'slim jims', 'protein bars', 'sweets', 'soup', 'chex-mix', 'peppers', 'dip', 'quinoa', 'lasagna', 'chips', 'tikka masala', 'pizza / wings / cheesecake', 'toast', 'pretzals', 'egg', 'popcorn', 'peanut butter sandwich', 'salsa', 'pasta', 'chocolate', 'candy', 'cheeseburgers', 'tuna sandwich', 'grapes','foods', 'macaroni', 'cornbread', 'chicken wings', 'chocolate ice cream', 'chocolate bar', 'watermelon', 'pancakes', 'lasagne', 'almonds', 'cottage cheese', 'milkshakes', 'chicken nuggs', 'pretzels', 'ritz', 'macaroons', 'noodle', 'nutella', 'fires', 'rice', 'none', 'debbie snacks', 'ice cream', 'candy bars', 'spaghetti', 'pizza cookies', 'butter', 'kit kat', 'hamburgers'}

list_removal_1st = set(flat_list)-list_keep_1st
print(list_removal_1st) ##TODO Check what words werer sorted out 


{'bread/crackers', 'icecream', 'pop', 'food', 'ice-cream', 'cups', 'home', 'salt', 'ranch', 'any kinds', 'candy pop chocolate chipotle moe', 'pizza chocolate chips bagels ice capps', 'ice cream/milkshake', 'grandma homemade chocolate cake anything homemade', 'ice crea', 'mac'}


In [100]:
def find_sentences_with_words(sentences, words):
    found_sentences = []
    for sentence in sentences:
        sentence_words = sentence.split()
        for word in words:
            if word.lower() in [s.lower() for s in sentence_words]:
                found_sentences.append(sentence)
                break
    return found_sentences

In [103]:
for x in data_nlp['comfort_food']:
    print(x if type(x)== float)

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

In [101]:
# data_nlp.loc[:,'comfort_food'] = data_nlp['comfort_food'].astype(str)

incorrect_food_name = find_sentences_with_words(data_nlp["comfort_food"].tolist(), list_removal_1st)

for x in incorrect_food_name: 
    print(x)


AttributeError: 'float' object has no attribute 'split'

chinese food, korean food 
mac and cheese vs mac n cheese


# testing

In [None]:
string2 = "ice cream, pizza, chinese food".lower()
string1 = "tomato soup, pizza, fritos, meatball sub, dr. pepper".lower()
string3 = "chex mix,wegmans"
get_Tree(string3).draw()


In [None]:
def get_Tree(text):
    grammar =  r'''
    NP: {<FT><NN>|<RB>?<NN.*>+|<NN><CC><NN>|<DT>?<NN|NNS>+}
    '''

    custom_tags_foodtypes = ['chinese', 'korean']
    custom_tags_waste = ['grandma', 'homemade']

    tokens = nltk.word_tokenize(text)

    ## FT stand for food type
    lotr_pos_tags = [
        (word, 'FT') if word in custom_tags_foodtypes
    else (word, tag) for word, tag in nltk.pos_tag(tokens)]
                 
    # lotr_pos_tags = nltk.pos_tag(tokens) ## previous version


    chunk_parser = nltk.RegexpParser(grammar)
    tree = chunk_parser.parse(lotr_pos_tags)

    return tree

In [None]:
string2 = "pasta, grandma homemade chocolate cake anything homemade ".lower()
string1 = "frozen yogurt, pizza, fast food".lower()
get_Tree(string2).draw()

In [None]:

for x in c:
    print(x)
 

In [None]:
'''This is an additional function I developed to sanitize informal language. Given my expertise, I couldn't come up with a more suitable alternative.'''

def informal2formal(text):

    text = remove_nonaplpha(text)
    formal_tokens = []
    informal_to_formal = {

        "n" : "and",
        "crea" : "cream"
 
    }

    tokens = nltk.word_tokenize(text)

    for token in tokens:
        if informal_to_formal.get(token) is not None:
            formal_tokens.append(informal_to_formal.get(token))
        else:
            formal_tokens.append(token)
    
    ## convert plural to singular
    formal_tokens = [lemmatizer.lemmatize(word)for word in formal_tokens]

    return formal_tokens
 


In [None]:
import re
def remove_nonaplpha(text):
    # Remove any non-alphabetic characters except for hyphens and spaces
    text = re.sub(r'[^a-zA-Z\s\,\/-]', ' ', text)
    # Replace multiple spaces or hyphens with a single space
    text = re.sub(r'[\s]+', ' ', text)
    # Replace hyphens with a space
    text = text.replace('-', ' ')
    ## Replace foward slash with a comma
    text = text.replace('/', ', ')
    # Trim leading and trailing spaces
    text = text.strip()
    
    return text.lower()


In [None]:
''' This is main function that chunk the sentence into word with POS tag using parser. It will also create tree that shows structure of sentence '''
def chunk_NP(text, origin = True):
    grammar =  r'''
    NP: { <RB>?<NN>*|<NN><CC><NN>|<DT>?<JJ>*<NN|NNS>+|}
    '''

    '''
    The rule states that whenever the chunk finds an optional determiner (DT) followed by any number of adjectives (JJ) and then a noun (NN) then the Noun Phrase(NP) chunk should be formed.
    '''


    if type(text) is not str:
        return ['none']
    else:    
        if origin:
            text = informal2formal(text)
            
        lotr_pos_tags = nltk.pos_tag(text)
        chunk_parser = nltk.RegexpParser(grammar)
        tree = chunk_parser.parse(lotr_pos_tags)

        noun_phrases = []
        for subtree in tree.subtrees():
            if subtree.label() =='NP':
                np_parts = []
                for leaf in subtree.leaves():
                    np_parts.append(leaf[0])
                noun_phrases.append(" ".join(np_parts))
        return noun_phrases


In [None]:
def get_Tree(text):
    grammar =  r'''
    NP: { <RB>?<NN>*|<NN><CC><NN>|<DT>?<JJ>*<NN|NNS>+|}
    '''
    
    # grammar = r'NP:{<NN><CC><NN> }'
    '''RB to extract 'and'
    '''

    words_in_text = informal2formal(text)
    lotr_pos_tags = nltk.pos_tag(words_in_text)
    chunk_parser = nltk.RegexpParser(grammar)
    tree = nltk.ne_chunk(lotr_pos_tags)

    return tree


In [None]:

for x in range(len(data_nlp["comfort_food"])):

    a = chunk_NP(data_nlp.loc[x,'comfort_food'])
    data_nlp.at[x,"comfort_food"] = a

    b = chunk_NP(data_nlp.loc[x, 'comfort_food_reasons'])
    data_nlp.at[x,'comfort_food_reasons'] = b    

In [None]:
data_cleaning.loc[:,['comfort_food','comfort_food_reasons', 'diet_current']] = data_nlp


In [None]:
for x in data_nlp_prev['comfort_food']:
    print(x)

In [None]:
cf_explode = data_cleaning.explode('comfort_food') 
## The explode() function is used to transform each element of a list-like to a row, replicating the index values.

print(cf_explode['comfort_food'].unique())
print(cf_explode['comfort_food'].unique().shape)
print(cf_explode['comfort_food'].shape)

grandma homemade chocolate cake anything homemade
ice crea

In [None]:
string = "Pizza / Wings / Cheesecake".lower()


string2 = "grandma homemade chocolate cake anything homemade".lower()

string3 = "Saltfish, Candy and Kit Kat "

string4 = "chocolate,ice cream/milkshake,cooky"
# print(chunk_NP(string2))
get_Tree(string2).draw()
# print(nltk.word_tokenize(string2))
print(informal2formal(string))

In [None]:
from autocorrect import Speller


spell = Speller(lang='en')
text = "ice cream/milkshake"
corrected_text = spell(text)

print(corrected_text)