In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import linear_model
% matplotlib inline

In [None]:
# Load the prepared dataset; the relative path is resolved against the
# notebook's current working directory.
finalset = pd.read_csv("finalset.csv")

In [None]:
# Quick sanity check: (rows, columns) of the frame that was just loaded.
finalset.shape

In [None]:
# Names of the five review-score columns. They are used as regression targets
# and are removed from the predictor matrix.
depcols = [
    'review/appearance',
    'review/aroma',
    'review/overall',
    'review/palate',
    'review/taste',
]

In [None]:
# One target Series per review dimension, pulled straight from the loaded frame.
y_appearance, y_aroma, y_overall, y_palate, y_taste = (
    finalset[score_col]
    for score_col in ('review/appearance', 'review/aroma', 'review/overall',
                      'review/palate', 'review/taste')
)

In [None]:
# Predictor matrix: everything except the five review targets and the
# "index" / "Unnamed: 0" columns (presumably leftover row counters from
# earlier exports -- verify against the CSV).
xcols = finalset.drop(depcols + ["index", "Unnamed: 0"], axis = 1)

In [None]:
# (rows, candidate predictors) after the target and bookkeeping columns are dropped.
xcols.shape

In [None]:
def BICcal(X, y, BIC_diff):
  """Greedy forward selection of the columns of ``X`` by OLS BIC.

  Starting from an empty model, each round fits one OLS model per remaining
  column (the already-selected columns plus that candidate) and keeps the
  candidate with the lowest BIC. The candidate is accepted only if it lowers
  the previous round's BIC by more than ``BIC_diff``; otherwise selection
  stops. ``X`` is handed to ``sm.OLS`` as-is: this function does not add a
  constant column.

  Parameters
  ----------
  X : pandas.DataFrame
    Candidate predictors, one per column.
  y : array-like
    Response passed to ``sm.OLS`` as ``endog``.
  BIC_diff : float
    Minimum BIC improvement (``previous - new``) required to accept a
    variable. A negative value tolerates a BIC increase of up to
    ``abs(BIC_diff)`` per step.

  Returns
  -------
  (list, list)
    The selected column names in the order they were added, and the BIC of
    the model after each addition (same length, same order).
  """
  cols = list(X.columns)
  good_vars = []
  good_vars_bic = []
  old_BIC = np.inf
  while cols:
    # np.inf rather than a large finite sentinel, so that any finite BIC,
    # however large, can still be picked as the best candidate.
    bestbic = np.inf
    bestvar = None
    for variable in cols:
      results = sm.OLS(endog=y, exog=X[good_vars + [variable]], missing='drop').fit()
      bic = results.bic
      if bic < bestbic:
        bestbic = bic
        bestvar = variable
    # No candidate produced a comparable BIC (e.g. every fit returned NaN):
    # there is nothing sensible to add, so stop.
    if bestvar is None:
      break
    # Written as "not (improvement > threshold)" so a NaN difference also stops.
    if not (old_BIC - bestbic) > BIC_diff:
      break
    good_vars.append(bestvar)
    cols.remove(bestvar)
    good_vars_bic.append(bestbic)
    old_BIC = bestbic
  return good_vars, good_vars_bic

In [None]:
# Forward-select predictors for the appearance score. The threshold is
# negative, so a step is still accepted when BIC rises by less than 1e-5.
# NOTE(review): the other four targets use -1e-6 -- confirm the difference is
# intentional.
appearancetraits, bic_currentappearance = BICcal(X=xcols, y=y_appearance, BIC_diff=-1e-5)

In [None]:
# Number of predictors kept for the appearance model.
len(appearancetraits)

In [None]:
# Same forward selection for the remaining four review scores. Each call
# returns the chosen columns and the BIC after every addition.
aromatraits, bic_currentaroma = BICcal(X=xcols, y=y_aroma, BIC_diff=-1e-6)
overalltraits, bic_currentoverall = BICcal(X=xcols, y=y_overall, BIC_diff=-1e-6)
palatetraits, bic_currentpalate = BICcal(X=xcols, y=y_palate, BIC_diff=-1e-6)
tastetraits, bic_currenttaste = BICcal(X=xcols, y=y_taste, BIC_diff=-1e-6)

In [None]:
# Predictors selected for appearance that no other target's selection contains.
print("unique to appearance")
print([
    trait for trait in appearancetraits
    if not any(trait in other
               for other in (aromatraits, overalltraits, palatetraits, tastetraits))
])


In [None]:
# Predictors selected for aroma that no other target's selection contains.
print("unique to aroma")
print([
    trait for trait in aromatraits
    if not any(trait in other
               for other in (appearancetraits, overalltraits, palatetraits, tastetraits))
])


In [None]:
# Predictors selected for the overall score that no other target's selection contains.
print("unique to overall")
print([
    trait for trait in overalltraits
    if not any(trait in other
               for other in (appearancetraits, aromatraits, palatetraits, tastetraits))
])


In [None]:
# Predictors selected for palate that no other target's selection contains.
print("unique to palate")
print([
    trait for trait in palatetraits
    if not any(trait in other
               for other in (appearancetraits, aromatraits, overalltraits, tastetraits))
])


In [None]:
# Predictors selected for taste that no other target's selection contains.
# Compares against `appearancetraits`, the list produced by the appearance
# selection cell; no `appearancetraits2` is defined anywhere in this notebook.
print("unique to taste")
print([x for x in tastetraits if (x not in appearancetraits and x not in aromatraits and x not in palatetraits and x not in overalltraits)])


In [None]:
# Predictors that survived selection for every one of the five targets,
# listed in the order they were added to the appearance model.
print("present in all:")
print([
    trait for trait in appearancetraits
    if all(trait in other
           for other in (aromatraits, overalltraits, palatetraits, tastetraits))
])

In [None]:
# Reproducible 90/10 split: draw 10% of the row labels (without replacement)
# for validation and keep every remaining label, in index order, for training.
np.random.seed(0)
validx = np.random.choice(xcols.index, size = int(xcols.shape[0] * .1), replace = False)
# A vectorised isin mask replaces the per-row `x not in validx` test, which
# rescanned the whole NumPy array for every row of the frame.
trainidx = xcols.index[~xcols.index.isin(validx)].tolist()

In [None]:
# Appearance: fit a plain linear regression on the BIC-selected columns and
# report the training R^2 and the mean squared error on the held-out rows.

df1 = xcols[appearancetraits].copy()

# trainidx / validx hold labels taken from xcols.index but are applied
# positionally through .iloc -- NOTE(review): this lines up only while the
# frame keeps a default 0..n-1 index; confirm.
y_train, y_val = y_appearance.iloc[trainidx], y_appearance.iloc[validx]
X_train, X_val = df1.iloc[trainidx].values, df1.iloc[validx].values

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print("Train R^2: " + str(regr.score(X_train, y_train)))
print("Val MSE: " + str(sum((regr.predict(X_val) - y_val) ** 2) / X_val.shape[0]))

In [None]:
# Aroma: fit a plain linear regression on the BIC-selected columns and
# report the training R^2 and the mean squared error on the held-out rows.

df1 = xcols[aromatraits].copy()

# .iloc is positional; trainidx / validx are labels from xcols.index
# (assumes the default 0..n-1 index -- confirm).
y_train, y_val = y_aroma.iloc[trainidx], y_aroma.iloc[validx]
X_train, X_val = df1.iloc[trainidx].values, df1.iloc[validx].values

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print("Train R^2: " + str(regr.score(X_train, y_train)))
print("Val MSE: " + str(sum((regr.predict(X_val) - y_val) ** 2) / X_val.shape[0]))

In [None]:
# Overall: fit a plain linear regression on the BIC-selected columns and
# report the training R^2 and the mean squared error on the held-out rows.

df1 = xcols[overalltraits].copy()

# .iloc is positional; trainidx / validx are labels from xcols.index
# (assumes the default 0..n-1 index -- confirm).
y_train, y_val = y_overall.iloc[trainidx], y_overall.iloc[validx]
X_train, X_val = df1.iloc[trainidx].values, df1.iloc[validx].values

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print("Train R^2: " + str(regr.score(X_train, y_train)))
print("Val MSE: " + str(sum((regr.predict(X_val) - y_val) ** 2) / X_val.shape[0]))

In [None]:
# Palate: fit a plain linear regression on the BIC-selected columns and
# report the training R^2 and the mean squared error on the held-out rows.

df1 = xcols[palatetraits].copy()

# .iloc is positional; trainidx / validx are labels from xcols.index
# (assumes the default 0..n-1 index -- confirm).
y_train, y_val = y_palate.iloc[trainidx], y_palate.iloc[validx]
X_train, X_val = df1.iloc[trainidx].values, df1.iloc[validx].values

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print("Train R^2: " + str(regr.score(X_train, y_train)))
print("Val MSE: " + str(sum((regr.predict(X_val) - y_val) ** 2) / X_val.shape[0]))

In [None]:
# Taste: fit a plain linear regression on the BIC-selected columns and
# report the training R^2 and the mean squared error on the held-out rows.

df1 = xcols[tastetraits].copy()

# .iloc is positional; trainidx / validx are labels from xcols.index
# (assumes the default 0..n-1 index -- confirm).
y_train, y_val = y_taste.iloc[trainidx], y_taste.iloc[validx]
X_train, X_val = df1.iloc[trainidx].values, df1.iloc[validx].values

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print("Train R^2: " + str(regr.score(X_train, y_train)))
print("Val MSE: " + str(sum((regr.predict(X_val) - y_val) ** 2) / X_val.shape[0]))

In [None]:
def BICChart(bic, varnames, title):
  """Plot a sequence of BIC values as a line, one x tick per attribute.

  Parameters
  ----------
  bic : sequence of float
    BIC values, plotted in order; must have as many entries as ``varnames``.
  varnames : sequence of str
    Labels for the x ticks, one per plotted point.
  title : str
    Figure title.

  Draws on a new 20x7 inch figure and returns nothing.
  """
  positions = range(len(varnames))
  _, axes = plt.subplots(figsize=(20, 7))
  axes.plot(positions, bic)
  axes.set_title(title)
  axes.set_xlabel("Attributes")
  axes.set_ylabel('BIC')
  # Slanted, right-anchored labels keep long attribute names readable.
  axes.set_xticks(positions)
  axes.set_xticklabels(varnames, rotation = 45, ha = "right", fontsize = 12)

In [None]:
# One BIC-vs-added-attribute chart per review score. Each BIC list is paired
# with the trait list returned by the same BICcal call, so lengths match
# (`appearancetraits`, not the undefined `appearancetraits2`).
BICChart(bic_currentappearance, appearancetraits, "Appearance Features")
BICChart(bic_currentoverall, overalltraits, "Overall Features")
BICChart(bic_currentaroma, aromatraits, "Aroma Features")
BICChart(bic_currentpalate, palatetraits, "Palate Features")
BICChart(bic_currenttaste, tastetraits, "Taste Features")