# <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTE2v_RBFRd6_fPwA5wYa0gjfhV3M2lzCp5uQ&usqp=CAU" width="240" height="360" />

 # Machine Learning on Candy dataset

# Table of Contents

1. [Problem Statement](#section1)<br>
2. [Data Loading and Description](#section2)
3. [Preprocessing](#section3)
4. [Model fitting and evaluation](#section4)<br>

<a id=section1></a>

## 1. Problem Statement

The goal is to predict the `winpercent` of Halloween candies and to find the best-fitting model among Linear Regression, Decision Tree, and Random Forest.

<a id=section2></a>

## 2. Data Loading and Description

<a id=section201></a>

- Data was collected by creating a website where participants were shown two fun-sized candies and asked to click on the one they would prefer to receive.
- The dataset comprises __85 candies and 13 columns__. Below is a table showing the names of all the columns and their descriptions.

| Column Name     | Description                                               |
| --------------- |:---------------------------------------------------------:|
| competitorname  | Name of competitor candy                                  | 
| chocolate       | Does it contain chocolate?                                |  
| fruity          | Is it fruit flavored?                                     | 
| caramel         | Is there caramel in the candy?                            |   
| peanutyalmondy  | Does it contain peanuts, peanut butter or almonds?        |
| nougat          | Does it contain nougat?                                   |
| crispedricewafer| Does it contain crisped rice, wafers, or a cookie component? |
| hard            | Is it a hard candy?                                       |
| bar             | Is it a candy bar?                                        |
| pluribus        | Is it one of many candies in a bag or box?                |
| sugarpercent    | The percentile of sugar it falls under within the data set|
| pricepercent    | The unit price percentile compared to the rest of the set |
| winpercent      | The overall win percentage                                |

#### Importing packages                                          

In [1]:
#importing calculation, visualization, dataframe, ML model creation packages

import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt


from sklearn import metrics 

import numpy as np 

# allow plots to appear directly in the notebook
%matplotlib inline

  'Matplotlib is building the font cache using fc-list. '


#### Importing the Dataset

In [3]:
# Fetch the candy CSV from its hosted location and preview the first rows

candy_csv_url = "https://raw.githubusercontent.com/insaid2018/Term-2/master/Projects/candy-data.csv"
data = pd.read_csv(candy_csv_url)
data.head()

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [4]:
# Print the (rows, columns) shape of the frame, then the per-column summary
# that DataFrame.info() writes out (dtypes and non-null counts).

print(data.shape)
data.info()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

<a id=section3></a>

## 3. Preprocessing the data

In [None]:
# Distribution of the target variable (winpercent)

fig, ax = plt.subplots(figsize=(12, 10), sharex=True)
sns.despine(left=True)  # hide the left spine (top/right are removed by default)

sns.distplot(data["winpercent"], color="b", ax=ax)

Target variable "Winpercent" is Normally distributed.

In [None]:
# Distribution of a continuous explanatory variable (sugarpercent)

fig, ax = plt.subplots(figsize=(12, 10), sharex=True)
sns.despine(left=True)  # hide the left spine (top/right are removed by default)
sns.distplot(data["sugarpercent"], color="r", ax=ax)

In [None]:
# Distribution of a continuous explanatory variable (pricepercent)
fig, ax = plt.subplots(figsize=(12, 10), sharex=True)
sns.despine(left=True)  # hide the left spine (top/right are removed by default)
sns.distplot(data["pricepercent"], color="g", ax=ax)

Explanatory variables - sugarpercent and pricepercent are Normally Distributed.

In [None]:
# Bring winpercent onto the same 0-1 scale as sugarpercent and pricepercent
# by dividing it by 100.
# NOTE: this overwrites the column in place, so re-running the cell divides again.
scaled_win = data['winpercent'] / 100
data['winpercent'] = scaled_win
data.head(10)

In [None]:
# Popularity of various candies: the five lowest and five highest winpercent
# values, drawn together as one horizontal bar chart.
popularity = data[['competitorname', 'winpercent']].sort_values(by='winpercent')
extremes = pd.concat([popularity.head(5), popularity.tail(5)], axis=0)
# `sort_columns` is intentionally not passed: it was deprecated in pandas 1.5
# and removed in 2.0 (where passing it makes the call fail), and it had no
# effect on a plot with explicit x and y columns.
extremes.plot(x='competitorname', y='winpercent', kind='barh', title='Popularity of various candies', figsize=(10, 5), legend=False)

In [None]:
# Top 5 most popular and 5 least popular candies (by winpercent), drawn as a
# horizontal bar chart with a legend.
popularity = data.sort_values(by=['winpercent'], ascending=True)
extremes = pd.concat([popularity.head(5), popularity.tail(5)], axis=0)
# `sort_columns` is intentionally not passed: it was deprecated in pandas 1.5
# and removed in 2.0 (where passing it makes the call fail), and it had no
# effect on a plot with explicit x and y columns.
extremes.plot(x='competitorname', y='winpercent', kind='barh', title='Popularity of various candies', figsize=(10, 5), legend=True);

In [None]:
# Competitors which are not chocolaty but winners: the ten highest-winpercent
# candies among those with chocolate == 0.
# Sorting is descending so that head(10) picks the most popular rows; an
# ascending sort would select the ten least popular instead.
nochocolate = (
    data[data['chocolate'] == 0]
    .sort_values(by=['winpercent'], ascending=False)
    .head(10)
)
nochocolate

In [None]:
# Horizontal bar chart of winpercent for the candies held in `nochocolate`.
# `sort_columns` is not passed: it was removed in pandas 2.0 and had no effect
# on a plot with explicit x and y columns.
nochocolate.plot(x='competitorname', y='winpercent', kind='barh', title='non chocolaty winners', figsize=(10, 5), legend=True);

In [None]:
# Top liked and cheap competitors: the ten highest winpercent values, with
# ties on winpercent broken in favour of the cheaper candy (lower pricepercent).
# The per-column `ascending` list is what makes price sort low-to-high while
# winpercent still sorts high-to-low.
d2 = data.sort_values(by=['winpercent', 'pricepercent'], ascending=[False, True]).head(10)
# `sort_columns` is not passed: it was removed in pandas 2.0 and had no effect
# on a plot with explicit x and y columns.
d2.plot(x='competitorname', y='winpercent', kind='barh', title='top winbyprice competitors', figsize=(10, 5), legend=True);

In [None]:
# Top 10 sugary candies: sort by sugarpercent from highest to lowest so that
# head(10) keeps the most sugary rows (an ascending sort would keep the least
# sugary ones).
d3 = data.sort_values(by=['sugarpercent'], ascending=False).head(10)
d3.plot(x='competitorname', y='sugarpercent', kind='barh', title='top sugary candies', figsize=(10, 5), legend=True);

In [None]:
# Candies flagged as both chocolate and fruity
has_chocolate = data['chocolate'] == 1
is_fruity = data['fruity'] == 1
data[has_chocolate & is_fruity]

#### Establishing __correlation__ between all the features using a __heatmap__.

In [None]:
# Correlation heatmap of the numeric columns
plt.figure(figsize=(20, 8))
# Correlate numeric columns only: competitorname holds text, and pandas >= 2.0
# raises instead of silently dropping non-numeric columns in DataFrame.corr().
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')

Correlation between all variables is <0.8.

## 4. Model fitting and evaluation 

<a id=section4></a>

In [None]:
# Prepare the feature matrix and target vector for training.

# X - every column except the candy name (a text identifier) and the target.
# Columns are selected by name rather than by position so the selection does
# not depend on the column order of the loaded file.
feature_columns = [
    column for column in data.columns
    if column not in ('competitorname', 'winpercent')
]
X = data[feature_columns]

# y - the target column, winpercent
y = data['winpercent']

print(X, y)

In [None]:
# Train/test split: 20% of the rows are held out for testing, and the fixed
# random_state makes the split repeatable across runs.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Model 1 - Linear regression
from sklearn.linear_model import LinearRegression
from sklearn import metrics

linreg = LinearRegression()

linreg.fit(X_train, y_train)  # estimates the intercept and coefficients

# Test-set RMSE. It is stored as RMSE_test_LR because that is the name the
# later display cell reads; it also mirrors RMSE_train_LR and RMSE_test_DT.
y_pred_test = linreg.predict(X_test)
RMSE_test_LR = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

# Training-set RMSE, for comparison against the test error
y_pred_train = linreg.predict(X_train)
RMSE_train_LR = np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))

In [None]:
# Display the test-set predictions held in y_pred_test
y_pred_test

In [None]:
# Print the parameters of the fitted linear model: its intercept and its
# coefficient array
print('Intercept:',linreg.intercept_)                                    
print('Coefficients:',linreg.coef_) 

In [None]:
# Display the actual target values of the held-out test rows
y_test

In [None]:
# Display the linear-regression test RMSE.
# NOTE(review): RMSE_test_LR must be assigned by the model-fitting cell before
# this cell runs - confirm the name matches what that cell defines.
RMSE_test_LR

In [None]:
# Model 2 - decision tree regressor with tree depth capped at 5
from sklearn.tree import DecisionTreeRegressor

DTreg = DecisionTreeRegressor(max_depth=5)
DTreg.fit(X_train, y_train)

# Score on the held-out rows: RMSE is the square root of the mean squared error
y_pred_test = DTreg.predict(X_test)
mse_dt = metrics.mean_squared_error(y_test, y_pred_test)
RMSE_test_DT = np.sqrt(mse_dt)
RMSE_test_DT

In [None]:
# Model 3 - random forest regressor built from 200 trees
from sklearn.ensemble import RandomForestRegressor

RFreg = RandomForestRegressor(n_estimators=200)
RFreg.fit(X_train, y_train)
y_pred_test = RFreg.predict(X_test)

# Keep the score under a model-specific name and display that value, so this
# cell reports the random forest's RMSE rather than the decision tree's.
RMSE_test_RF = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
RMSE_test_RF