import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Load the nutrition table from the local CSV file.
data = pd.read_csv("C:\\Users\\KS100\\Documents\\nutrition.csv")
# Preview of the first rows (only rendered in an interactive session) and a
# printed summary of the columns, their dtypes and non-null counts.
data.head()
data.info()

# Remove the serving_size column.
data = data.drop(columns='serving_size')
# Replace every missing entry with 0.
data = data.fillna(0)

import re

# Divisors that convert a quantity written in the given unit into grams.
# Any unit not listed here is left unscaled (divisor 1).
_GRAM_DIVISORS = {'mg': 1000, 'mcg': 1000000}
_UNIT_PATTERN = re.compile('[a-zA-Z]+')
_LETTER_PATTERN = re.compile('[a-zA-Z]')


def _is_zero(value):
    """Return True for a bare zero: the number 0 or the string '0'."""
    return value == '0' or value == 0


def _unit_divisor(quantity):
    """Return the gram divisor for the first run of letters in *quantity*.

    A quantity without any letters, or with a unit that is not in
    ``_GRAM_DIVISORS``, gets divisor 1 so the number is kept unscaled.
    """
    found = _UNIT_PATTERN.search(quantity)
    if found is None:
        return 1
    return _GRAM_DIVISORS.get(found.group(), 1)


def _to_number(value, divisor):
    """Strip the letters from *value* and divide the number by *divisor*.

    Bare zeros carry no unit and are converted to ``0.0`` directly.
    """
    if _is_zero(value):
        return float(value)
    return float(_LETTER_PATTERN.sub('', value)) / divisor


# Turn every non-numeric column except `name` into a float column.
for col in data.drop('name', axis=1).select_dtypes(exclude='number').columns:
    quantities = [value for value in data[col] if not _is_zero(value)]
    if not quantities:
        # Only bare zeros: nothing reveals the unit, so the column is left
        # untouched and keeps its original name.
        continue
    # The first non-zero entry decides the unit for the whole column.
    divisor = _unit_divisor(quantities[0])
    data[col] = [_to_number(value, divisor) for value in data[col]]
    # NOTE: the '(g)' suffix is appended whatever unit was found, including
    # units that are not in _GRAM_DIVISORS and were therefore not rescaled.
    data.rename({col: col + '(g)'}, axis=1, inplace=True)

# The food category is the part of the name that precedes the first comma
# (the whole name when it contains no comma).
data['food_categories'] = data['name'].map(lambda full_name: full_name.partition(',')[0])

# One distribution plot per numeric column, titled with the column name.
numeric_columns = data.select_dtypes('number').columns
for column_name in numeric_columns:
    sns.displot(data[column_name])
    plt.title(column_name)
    plt.show()

# Do not truncate the column list when a frame is displayed.
pd.set_option('display.max_columns', None)
# These statistics are only defined for numbers. The text columns (name,
# food_categories) are excluded explicitly because current pandas raises a
# TypeError on them instead of silently dropping them from the result.
data.select_dtypes('number').agg(['mean', 'median', 'std', 'skew', 'kurtosis'])

data['calories'].describe()

# Outlier limits (Tukey's fences): a value is an outlier when it lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
cal_Q1 = data['calories'].quantile(0.25)
cal_Q3 = data['calories'].quantile(0.75)
cal_IQR = cal_Q3 - cal_Q1
lower_fence = cal_Q1 - 1.5 * cal_IQR
upper_fence = cal_Q3 + 1.5 * cal_IQR
is_outlier = (data['calories'] < lower_fence) | (data['calories'] > upper_fence)
# Number of calorie outliers per food category.
data[is_outlier]['food_categories'].value_counts()

#Importing the transformer
from sklearn.preprocessing import StandardScaler

# Only numeric columns can be standardised. Selecting by dtype leaves out
# both text columns (name and food_categories); dropping `name` alone would
# pass the food_categories strings to the scaler, which cannot convert them.
numeric_features = data.select_dtypes('number')
scaler = StandardScaler()
training = pd.DataFrame(
    scaler.fit_transform(numeric_features),
    columns=numeric_features.columns,
)

from scipy.cluster.hierarchy import linkage, dendrogram

# Build the cluster hierarchy on the scaled features using Ward linkage.
Z = linkage(training, method='ward')
# truncate_mode='lastp' condenses the tree so that not every single
# observation is drawn as its own leaf.
dendrogram(Z, truncate_mode='lastp')
plt.xticks(rotation=90, fontsize=10)
plt.ylabel('Distance')
plt.xlabel('Cluster')
plt.title('Agglomerative Clustering')
# Finish this figure here, like the other plotting sections do; otherwise
# the next plot drawn on the current axes ends up on top of the dendrogram
# when the file is run as a script.
plt.show()

from sklearn.cluster import AgglomerativeClustering

# Two clusters, the number chosen from the preceding analysis.
ach = AgglomerativeClustering(n_clusters=2).fit(training)
# Store each row's cluster assignment next to the original data.
data['label'] = ach.labels_

from sklearn.manifold import TSNE

# t-SNE is a stochastic method, so random_state is fixed to make the
# embedding repeatable.
tsne = TSNE(random_state=0)
# Two-column embedding of the scaled features, with the cluster label of
# each row attached for colouring.
tsne_results = pd.DataFrame(
    tsne.fit_transform(training),
    columns=['tsne1', 'tsne2'],
).assign(label=data['label'])
# Scatter plot of the embedding, coloured by cluster.
sns.scatterplot(x='tsne1', y='tsne2', hue='label', data=tsne_results)
plt.show()


# Compare the two clusters feature by feature. The rows of each cluster are
# selected once, outside the loop.
label_0 = data[data['label'] == 0]
label_1 = data[data['label'] == 1]
# The cluster label itself is skipped: within a cluster it is constant, so
# it has no distribution to draw.
features = [name for name in data.select_dtypes('number').columns if name != 'label']
for feature in features:
    # histplot with kde=True and stat='density' replaces the deprecated
    # distplot; explicit colours keep the two clusters distinguishable.
    sns.histplot(label_0[feature], kde=True, stat='density', color='C0', label='label 0')
    sns.histplot(label_1[feature], kde=True, stat='density', color='C1', label='label 1')
    plt.title(feature)
    plt.legend()
    plt.show()
    
# Food categories of the rows assigned to cluster 0, most frequent first.
data.loc[data['label'] == 0, 'food_categories'].value_counts()

# Food categories of the rows assigned to cluster 1, most frequent first.
data.loc[data['label'] == 1, 'food_categories'].value_counts()
