In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('Food_Preference.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Timestamp       288 non-null    object
 1   Participant_ID  288 non-null    object
 2   Gender          284 non-null    object
 3   Nationality     288 non-null    object
 4   Age             288 non-null    int64 
 5   Food            288 non-null    object
 6   Juice           288 non-null    object
 7   Dessert         288 non-null    object
dtypes: int64(1), object(7)
memory usage: 18.1+ KB


In [4]:
data.head()

Unnamed: 0,Timestamp,Participant_ID,Gender,Nationality,Age,Food,Juice,Dessert
0,2019/05/07 2:59:13 PM GMT+8,FPS001,Male,Indian,24,Traditional food,Fresh Juice,Maybe
1,2019/05/07 2:59:45 PM GMT+8,FPS002,Female,Indian,22,Western Food,Carbonated drinks,Yes
2,2019/05/07 3:00:05 PM GMT+8,FPS003,Male,Indian,31,Western Food,Fresh Juice,Maybe
3,2019/05/07 3:00:11 PM GMT+8,FPS004,Female,Indian,25,Traditional food,Fresh Juice,Maybe
4,2019/05/07 3:02:50 PM GMT+8,FPS005,Male,Indian,27,Traditional food,Fresh Juice,Maybe


In [5]:
data.describe()

Unnamed: 0,Age
count,288.0
mean,30.597222
std,11.180383
min,8.0
25%,24.0
50%,28.0
75%,36.25
max,80.0


In [6]:
data.drop(['Participant_ID', 'Timestamp'], axis=1, inplace=True)

In [8]:
data.columns

Index(['Gender', 'Nationality', 'Age', 'Food', 'Juice', 'Dessert'], dtype='object')

Missing Data Values

In [9]:
data.dropna(axis=0, inplace=True)
data.reset_index(drop=True, inplace=True)

In [10]:
data['Age']

0      24
1      22
2      31
3      25
4      27
       ..
279    27
280    24
281    25
282    27
283    27
Name: Age, Length: 284, dtype: int64

In [15]:
data['Age'].mean()

30.654929577464788

In [13]:
# divide them into 2 as yound and old
age_bins = pd.qcut(data['Age'], q=2, labels=[0,1])

In [14]:
pd.concat([data['Age'], age_bins], axis=1)

Unnamed: 0,Age,Age.1
0,24,0
1,22,0
2,31,1
3,25,0
4,27,0
...,...,...
279,27,0
280,24,0
281,25,0
282,27,0


In [16]:
data['Age'] = age_bins

In [17]:
data['Age']

0      0
1      0
2      1
3      0
4      0
      ..
279    0
280    0
281    0
282    0
283    0
Name: Age, Length: 284, dtype: category
Categories (2, int64): [0 < 1]

Encoding

In [18]:
data.columns

Index(['Gender', 'Nationality', 'Age', 'Food', 'Juice', 'Dessert'], dtype='object')

In [19]:
cat_feats = ['Gender', 'Nationality', 'Food', 'Juice', 'Dessert']

In [20]:
#getting unique objects in each category

def get_uniques(df, columns):
    return {column: list(df[column].unique()) for column in columns}

In [21]:
get_uniques(data, cat_feats)

{'Gender': ['Male', 'Female'],
 'Nationality': ['Indian',
  'Pakistani ',
  'Tanzanian',
  'Indonesia',
  'Pakistan',
  'Maldivian ',
  'MY',
  'Malaysian',
  'Malaysian ',
  'Indonesian ',
  'Maldivian',
  'MALAYSIAN',
  'Malaysia ',
  'Pakistani',
  'Canadian',
  'Nigerian ',
  'Algerian ',
  'Korean ',
  'Seychellois',
  'Indonesain',
  'Indonesian',
  'Malaysia',
  'Japan',
  'China',
  'Mauritian',
  'Yemen'],
 'Food': ['Traditional food', 'Western Food'],
 'Juice': ['Fresh Juice', 'Carbonated drinks'],
 'Dessert': ['Maybe', 'Yes', 'No']}

In [22]:
binary_features = ['Gender', 'Food', 'Juice']

ordinal_features = ['Dessert']

nominal_features = ['Nationality']

In [23]:
def binary_encode(df, column, positive_label):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_label else 0)
    return df

In [24]:
def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

In [25]:
def onehot_encode(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column])
    df = pd.concat([df, dummies], axis=1)
    df.drop(column, axis=1, inplace=True)
    return df

In [26]:
data = binary_encode(data, 'Gender', 'Male')
data = binary_encode(data, 'Food', 'Traditional food')
data = binary_encode(data, 'Juice', 'Fresh Juice')

dessert_ordering = ['No', 'Maybe', 'Yes']
data = ordinal_encode(data, 'Dessert', dessert_ordering)

data = onehot_encode(data, 'Nationality')

In [27]:
data

Unnamed: 0,Gender,Age,Food,Juice,Dessert,Algerian,Canadian,China,Indian,Indonesain,...,Maldivian,Maldivian.1,Mauritian,Nigerian,Pakistan,Pakistani,Pakistani.1,Seychellois,Tanzanian,Yemen
0,1,0,1,1,1,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,0,0,0,0,2,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,1,1,0,1,1,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,0,0,1,1,1,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,1,0,1,1,1,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,1,0,0,1,2,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
280,1,0,1,1,2,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
281,1,0,1,1,2,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
282,1,0,1,1,2,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


Scaling and Splitting

In [28]:
y = data['Age']
x = data.drop(['Age'], axis=1)

In [29]:
scaler = MinMaxScaler()

x = scaler.fit_transform(x)

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)

Train our Data

In [32]:
model = LogisticRegression()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.6976744186046512