In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import scipy


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [22]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
df = pd.read_csv("/content/gdrive/My Drive/Dataset_for_gender_classification.csv")
#we have 4 features - favourite music genre, favourite color, fav beverage, fav soft drink, with 66 datapoints.
df.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


In [4]:
df.columns

Index(['Favorite Color', 'Favorite Music Genre', 'Favorite Beverage',
       'Favorite Soft Drink', 'Gender'],
      dtype='object')

In [5]:
#Preprocessing
#converting dtype from object to categorical
for col_name in df.columns:
    if(df[col_name].dtype == 'object'):
        df[col_name]= df[col_name].astype('category')
        df[col_name] = df[col_name].cat.codes

In [6]:
df.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,0,6,3,0,0
1,1,2,3,1,0
2,2,6,5,1,0
3,2,1,4,2,0
4,0,6,3,1,0


In [7]:
X = df.iloc[:,0:4]
Y = df.iloc[:,-1]

In [8]:
X.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink
0,0,6,3,0
1,1,2,3,1
2,2,6,5,1
3,2,1,4,2
4,0,6,3,1


In [9]:
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: Gender, dtype: int8

Random forest Classifier

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
clf=RandomForestClassifier(n_estimators=75, criterion = 'gini', max_depth=4, min_samples_split=4, max_features='log2', min_samples_leaf=5)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5


In [11]:
#To get tree printed
#estimator = clf.estimators_[5]
#from sklearn.tree import export_graphviz

#export_graphviz(estimator, out_file='tree.dot', rounded = True, proportion = False, precision = 2, filled = True)
#from subprocess import call
#call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
#from IPython.display import Image
#Image(filename = 'tree.png')



#export_graphviz(clf)


K Nearest neighbour

In [12]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.7


In [13]:
df.corr() #to get correlation matrix

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
Favorite Color,1.0,-0.068996,0.228255,-0.04838,-0.115564
Favorite Music Genre,-0.068996,1.0,0.056462,-0.230102,-0.192661
Favorite Beverage,0.228255,0.056462,1.0,-0.177365,-0.114119
Favorite Soft Drink,-0.04838,-0.230102,-0.177365,1.0,0.18851
Gender,-0.115564,-0.192661,-0.114119,0.18851,1.0


K means Clustering


In [14]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

kmeans = KMeans( init="random", n_clusters=2, n_init=10, max_iter=300, random_state=42, verbose =0)
kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300, n_clusters=2,
       n_init=10, n_jobs=None, precompute_distances='auto', random_state=42,
       tol=0.0001, verbose=0)

In [15]:
kmeans.cluster_centers_

array([[0.38461538, 0.76923077, 2.        , 1.46153846],
       [0.66666667, 4.87878788, 2.45454545, 1.15151515]])

SVM

In [16]:
#Import svm model
from sklearn import svm
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel
#Train the model using the training sets
clf.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.55


In [17]:
# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Precision: 0.8
Recall: 0.3333333333333333


In [18]:
clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

Logistic Regression

In [19]:
# import the class
from sklearn.linear_model import LogisticRegression
# instantiate the model (using the default parameters)
logreg = LogisticRegression()
# fit the model with data
logreg.fit(X_train,y_train)

y_pred=logreg.predict(X_test)

In [20]:
#Confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[ 6,  2],
       [10,  2]])

In [21]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.4
Precision: 0.5
Recall: 0.16666666666666666
