In [1]:
#importing different libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")   # ignore if any warnings are there

In [38]:
# loading the dataset
data = pd.read_csv(r"C:\Jupyter\DSML\DSML\Logistic Regression\candy-data.csv")
df = pd.DataFrame(data)
df.drop_duplicates(inplace=True) # drop duplicates if any.
df.shape # num rows x num columns.

(85, 13)

In [39]:
#Check for missing values
(df.isnull().sum()/len(df)*100).sort_values(ascending=False)

winpercent          0.0
pricepercent        0.0
sugarpercent        0.0
pluribus            0.0
bar                 0.0
hard                0.0
crispedricewafer    0.0
nougat              0.0
peanutyalmondy      0.0
caramel             0.0
fruity              0.0
chocolate           0.0
competitorname      0.0
dtype: float64

In [40]:
df.head(3)

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086


In [41]:
df.describe()

Unnamed: 0,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
count,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0
mean,0.435294,0.447059,0.164706,0.164706,0.082353,0.082353,0.176471,0.247059,0.517647,0.478647,0.468882,50.316764
std,0.498738,0.50014,0.373116,0.373116,0.276533,0.276533,0.383482,0.433861,0.502654,0.282778,0.28574,14.714357
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011,0.011,22.445341
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.255,39.141056
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.465,0.465,47.829754
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.732,0.651,59.863998
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.988,0.976,84.18029


In [42]:
df['winpercent'] = df['winpercent']/100

In [43]:
df['sugarbyprice'] = df['sugarpercent'].div(df['pricepercent']) # higher value means the candy is sweet as well as cheap.
df['winbyprice'] = df['winpercent'].div(df['pricepercent']) # higher value means the candy is more liked as well as cheap.

In [44]:
df['competitorname'] = df['competitorname'].str.replace('Õ', "'") # Special character was appearing in name of candy.
df.sort_values(by=['winpercent', 'sugarpercent'], ascending=False).head()
#Reese's seem to be a favourite. Note that all the top competitors are chocolaty as well. 
#Also, Reese's Miniatures is very cheap when compared to top competitors and overall as well.

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent,sugarbyprice,winbyprice
52,Reese's Peanut Butter cup,1,0,0,1,0,0,0,0,0,0.72,0.651,0.841803,1.105991,1.293092
51,Reese's Miniatures,1,0,0,1,0,0,0,0,0,0.034,0.279,0.818663,0.121864,2.934274
79,Twix,1,0,1,0,0,1,0,1,0,0.546,0.906,0.816429,0.602649,0.901136
28,Kit Kat,1,0,0,0,0,1,0,1,0,0.313,0.511,0.767686,0.612524,1.502321
64,Snickers,1,0,1,1,1,0,0,1,0,0.546,0.651,0.766738,0.83871,1.177785


In [45]:
#Competitors which are not chocolaty but winners.
df[df['chocolate']==0].sort_values(by=['winpercent', 'sugarpercent'], ascending=False).head(10)

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent,sugarbyprice,winbyprice
68,Starburst,0,1,0,0,0,0,0,0,1,0.151,0.22,0.670376,0.686364,3.047165
60,Skittles original,0,1,0,0,0,0,0,0,1,0.941,0.22,0.630851,4.277273,2.867506
66,Sour Patch Kids,0,1,0,0,0,0,0,0,1,0.069,0.116,0.59864,0.594828,5.160689
18,Haribo Gold Bears,0,1,0,0,0,0,0,0,1,0.465,0.465,0.571197,1.0,1.228382
41,Nerds,0,1,0,0,0,0,1,0,1,0.848,0.325,0.55354,2.609231,1.703201
61,Skittles wildberry,0,1,0,0,0,0,0,0,1,0.941,0.22,0.551037,4.277273,2.504713
73,Swedish Fish,0,1,0,0,0,0,0,0,1,0.604,0.755,0.548611,0.8,0.726637
31,Lifesavers big ring gummies,0,1,0,0,0,0,0,0,0,0.267,0.279,0.529114,0.956989,1.896466
67,Sour Patch Tricksters,0,1,0,0,0,0,0,0,1,0.069,0.116,0.528259,0.594828,4.553961
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,0.523415,1.772994,1.024295


In [46]:
#checking the corelation between all the independent columns
df.corr().abs().style.background_gradient(cmap='coolwarm')

Unnamed: 0,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent,sugarbyprice,winbyprice
chocolate,1.0,0.741721,0.249875,0.377824,0.254892,0.34121,0.344177,0.597421,0.339675,0.104169,0.504675,0.636517,0.158692,0.029718
fruity,0.741721,1.0,0.335485,0.39928,0.269367,0.269367,0.390678,0.515066,0.299725,0.034393,0.430969,0.380938,0.140281,0.0112828
caramel,0.249875,0.335485,1.0,0.0593561,0.328493,0.213113,0.122355,0.33396,0.269585,0.221933,0.254327,0.213416,0.12015,0.135568
peanutyalmondy,0.377824,0.39928,0.0593561,1.0,0.213113,0.0176463,0.205557,0.26042,0.206109,0.0878893,0.309153,0.406192,0.154065,0.128925
nougat,0.254892,0.269367,0.328493,0.213113,1.0,0.0897436,0.138675,0.522976,0.310339,0.123081,0.153196,0.199375,0.0813281,0.0890699
crispedricewafer,0.34121,0.269367,0.213113,0.0176463,0.0897436,1.0,0.138675,0.423751,0.224693,0.0699497,0.328265,0.32468,0.112514,0.0966673
hard,0.344177,0.390678,0.122355,0.205557,0.138675,0.138675,1.0,0.265165,0.0145317,0.0918097,0.244365,0.310382,0.310325,0.0337946
bar,0.597421,0.515066,0.33396,0.26042,0.522976,0.423751,0.265165,1.0,0.593409,0.0999852,0.518407,0.429929,0.198212,0.182976
pluribus,0.339675,0.299725,0.269585,0.206109,0.310339,0.224693,0.0145317,0.593409,1.0,0.0455228,0.220794,0.247448,0.131104,0.166855
sugarpercent,0.104169,0.034393,0.221933,0.0878893,0.123081,0.0699497,0.0918097,0.0999852,0.0455228,1.0,0.329706,0.229151,0.171254,0.244067


In [47]:
# Create correlation matrix
corr_matrix = df.corrwith(df.chocolate).abs()
corr_matrix

chocolate           1.000000
fruity              0.741721
caramel             0.249875
peanutyalmondy      0.377824
nougat              0.254892
crispedricewafer    0.341210
hard                0.344177
bar                 0.597421
pluribus            0.339675
sugarpercent        0.104169
pricepercent        0.504675
winpercent          0.636517
sugarbyprice        0.158692
winbyprice          0.029718
dtype: float64

In [48]:
#Remove columns which have less correlation to chocolate 
df=df.drop(['competitorname','caramel', 'peanutyalmondy','nougat','crispedricewafer','hard','pluribus','sugarpercent','sugarbyprice','winbyprice'], axis = 1) 

In [49]:
df.head(3)

Unnamed: 0,chocolate,fruity,bar,pricepercent,winpercent
0,1,0,1,0.86,0.669717
1,1,0,1,0.511,0.676029
2,0,0,0,0.116,0.322611


In [50]:
x=df.drop('chocolate',axis=1)
y=df.chocolate

In [51]:
#spiltting the dataset into train and test dataset
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=101)

In [52]:
model = LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [53]:
#predecting the value of y_pred
y_pred = model.predict(x_test)

In [54]:
#checking the accuracy score
accuracy_score(y_pred,y_test)

0.9310344827586207

In [55]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       1.00      0.85      0.92        13

   micro avg       0.93      0.93      0.93        29
   macro avg       0.94      0.92      0.93        29
weighted avg       0.94      0.93      0.93        29



In [56]:
print(confusion_matrix(y_pred,y_test))

[[16  0]
 [ 2 11]]
