# Importing libraries and dataset

In [64]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import cupy as np # linear algebra
import cudf as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the entries found directly under the read-only input directory.
print(os.listdir(path="../input"))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [65]:
# Load the voice CSV into a dataframe (`pd` is cudf in this notebook) and
# preview its first five rows.
voice = pd.read_csv('../input/voicecsv/voice.csv')
voice.head(n=5)

In [66]:
# Keep the original column index; it is used further down to restore the
# feature names on X after scaling.
features = voice.columns

# Checking for null values and feature properties

In [67]:
# Print the dataframe summary to check column types and look for missing values.
voice.info()

In [68]:
# import pandas
# voice = voice.to_pandas()

# Encoding data

In [69]:
# Replace the 'label' column with integer codes produced by cuML's LabelEncoder.
# Later cells treat code 0 as female and code 1 as male (see the KDE plot
# labels and `target_names`).
from cuml.preprocessing import LabelEncoder
le = LabelEncoder()
voice['label'] = le.fit_transform(voice['label'])

In [70]:
# Display the frame to confirm that 'label' now holds the encoded values.
voice

In [71]:
import seaborn as sns
import matplotlib.pyplot as plt

# Per-feature density plots, one panel for each of the first 20 columns
# (assumed to be the features, with 'label' as the remaining column -- this
# matches the 4x5 grid the notebook was written for).
# Class codes: 0 is plotted as 'F' (green), 1 as 'M' (blue).
fig, axes = plt.subplots(4, 5, figsize=(15, 15))

# The class masks do not depend on the feature, so build them once.
is_female = voice['label'] == 0
is_male = voice['label'] == 1

for position, ax in enumerate(axes.flat):
    column = voice.columns[position]
    # seaborn works on host data, hence the cudf -> pandas conversion.
    sns.kdeplot(voice.loc[is_female, column].to_pandas(), color='green', label='F', ax=ax)
    sns.kdeplot(voice.loc[is_male, column].to_pandas(), color='blue', label='M', ax=ax)
    ax.set_title(column)
    # Without an explicit legend the 'F'/'M' labels are never rendered.
    ax.legend()

# Keep titles and tick labels of neighbouring panels from overlapping.
fig.tight_layout()

In [72]:
# Target vector: the encoded label column.
y = voice['label']
# Feature matrix: every remaining column.
X = voice.drop(columns=['label'])

In [73]:
# Wrap the target in a Series (`pd` is cudf here) and show the resulting type.
# NOTE(review): `y` was selected as a single column above, so it is presumably
# already a Series and this wrap changes nothing -- confirm before removing.
y = pd.Series(y)
type(y)

In [74]:
# Encode the target with a fresh LabelEncoder.
# NOTE(review): 'label' was already encoded in an earlier cell, so this looks
# redundant; it also re-binds `le` to an encoder fitted on the integer codes
# rather than on the original string labels -- confirm nothing relies on
# `le` mapping back to the original class names.
from cuml.preprocessing import LabelEncoder
le = LabelEncoder()
y  = le.fit_transform(y)

# Normalisation

In [75]:
# Rescale the features with cuML's MinMaxScaler; min-max scaling was chosen by
# the author because the data is not normally distributed.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split further down, so the held-out rows influence the scaling --
# confirm this is acceptable or fit on the training split only.
from cuml.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
X = mm_scaler.fit_transform(X)
# A separate hold-out set would be scaled with the already-fitted scaler:
# mm_scaler.transform(X_test)

# Earlier manual scaling attempt, kept for reference:
# X = (X-np.min(X))/(np.max(X)-np.min(X))
# X.head()

In [76]:
# Put the original feature names back on the scaled frame: the first 20
# entries of the saved column index (presumably every column except 'label').
X.columns = features[:20]
# Display the scaled features.
X

# Checking for correlation & Splitting dataset

In [77]:
# Inspect pairwise feature correlation before modelling; strongly correlated
# features are candidates for selection / reduction.
import seaborn as sns

# DataFrame.corr() builds the correlation matrix (the NumPy counterpart is
# numpy.corrcoef). seaborn needs host data, so convert to pandas first; the
# heatmap colours each cell according to its value.
corr_matrix = X.corr().to_pandas()
sns.heatmap(corr_matrix)

In [78]:
# Randomly divide the data into training (80%) and test (20%) sets.
# A fixed random_state makes the shuffle -- and therefore every metric
# reported below -- reproducible across kernel restarts.
from cuml.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree (Sklearn)

In [79]:
#Find the best parameter to prune the tree
from sklearn.tree import DecisionTreeClassifier
# from cuml.ensemble import RandomForestClassifier as cuRF
def dt_error(n, X_train, y_train, X_test, y_test, random_state=None):
    """Sweep ``max_leaf_nodes`` and return the tree size with the lowest hold-out error.

    For every ``k`` in ``range(2, n)`` a ``DecisionTreeClassifier(max_leaf_nodes=k)``
    is fitted on the training data and its misclassification rate on the
    supplied test data is recorded. The best size is printed, the error curve
    is plotted, and the best size is returned.

    Parameters
    ----------
    n : int
        Exclusive upper bound of the sweep; must be greater than 2.
    X_train, y_train
        Training features and labels, passed to ``fit``.
    X_test, y_test
        Evaluation features and labels used to score each candidate size.
    random_state : optional
        Forwarded to every tree. The default ``None`` keeps the previous,
        unseeded behaviour; pass an int for a repeatable sweep.

    Returns
    -------
    int
        The ``max_leaf_nodes`` value with the lowest error (the smallest such
        value when several sizes tie).

    Raises
    ------
    ValueError
        If ``n <= 2``, because no tree size would be evaluated.

    Notes
    -----
    The size is selected on the data passed as ``X_test``/``y_test``. If the
    same data is later used to report final metrics, those metrics will be
    optimistic.
    """
    if n <= 2:
        raise ValueError("n must be greater than 2 so that at least one tree size is evaluated")

    nodes = range(2, n)
    error_rate = []
    for k in nodes:
        model = DecisionTreeClassifier(max_leaf_nodes=k, random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Take the mean of the boolean mismatch mask directly instead of going
        # through the notebook's `np` alias (which is cupy): the inputs here
        # are host-side data, and float() keeps the list plain Python numbers.
        error_rate.append(float((y_pred != y_test).mean()))

    # list.index returns the first minimum, i.e. the smallest tree on ties.
    kloc = error_rate.index(min(error_rate))
    print("Lowest error is %s occurs at n=%s." % (error_rate[kloc], nodes[kloc]))

    plt.plot(nodes, error_rate)
    plt.xlabel('Tree Size')
    # The curve is a misclassification rate on the hold-out data, not a
    # cross-validated MSE.
    plt.ylabel('Test error rate')
    plt.show()
    return nodes[kloc]

In [80]:
# Try tree sizes 2..29 on host (pandas) copies of the split and keep the size
# with the lowest error.
n = dt_error(
    30,
    X_train=X_train.to_pandas(),
    y_train=y_train.to_pandas(),
    X_test=X_test.to_pandas(),
    y_test=y_test.to_pandas(),
)

In [81]:
# Build the pruned tree: Gini impurity, capped at the leaf count chosen by the
# sweep above.
from sklearn import tree
pruned_tree = DecisionTreeClassifier(max_leaf_nodes=n, criterion='gini')

# The entropy criterion gives better precision for the female class (98) but lower precision for the male class (94),
# so we avoid it because of possible overfitting.

In [82]:
from sklearn.metrics import classification_report

# Class names in encoded order: 0 -> female, 1 -> male.
target_names = ['female', 'male']

# scikit-learn runs on the host, so every cudf object is converted to pandas.
# fit() returns the estimator itself, which allows chaining predict().
y_pred = pruned_tree.fit(X_train.to_pandas(), y_train.to_pandas()).predict(X_test.to_pandas())

print(
    classification_report(
        y_test.to_pandas(),
        y_pred,
        digits=5,
        target_names=target_names,
    )
)

In [83]:
from sklearn.ensemble import AdaBoostClassifier

# Boost the pruned tree: 200 SAMME rounds with a learning rate of 0.5.
boosted_dt = AdaBoostClassifier(
    pruned_tree,
    n_estimators=200,
    learning_rate=0.5,
    algorithm='SAMME',
)
# Train on host copies of the training split, then predict the held-out rows.
y_boost_pred = boosted_dt.fit(X_train.to_pandas(), y_train.to_pandas()).predict(X_test.to_pandas())

In [84]:
# Per-class precision / recall / F1 of the boosted tree on the held-out rows.
print(
    classification_report(
        y_test.to_pandas(),
        y_boost_pred,
        digits=5,
        target_names=target_names,
    )
)

# Logistic Regression

In [85]:
from cuml.linear_model import LogisticRegression as cuLR
from cuml.metrics import accuracy

# Fit cuML's logistic regression (with an intercept term) on the training
# split and print its accuracy on the held-out split.
cuml_reg_model = cuLR(fit_intercept=True)
trained_LR = cuml_reg_model.fit(X_train, y_train)
y_lr_pred = trained_LR.predict(X_test)
print(accuracy.accuracy_score(y_test, y_lr_pred))

# Random Forest Classifier

In [86]:
import numpy
from cuml.ensemble import RandomForestClassifier as cuRFC

# Train a small cuML random forest (5 trees) on the training split.
cuml_model = cuRFC(n_estimators=5)
cuml_model.fit(X_train, y_train)

# Score it on the held-out split; `accuracy` is the cuml.metrics module
# imported in the logistic-regression cell.
y_rf_pred = cuml_model.predict(X_test)
cu_score = accuracy.accuracy_score(y_test, y_rf_pred)
print(cu_score)