<p> <font size="6.5"> Supervised Learning Examples </font> </p>


### 1. Load the needed libraries

In [1]:
from sklearn.svm import LinearSVC # for Support Vector Machine
from sklearn.ensemble import RandomForestClassifier # for Random Forest
from sklearn.ensemble import GradientBoostingClassifier # for Gradient Boosting Machine

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings 
warnings.simplefilter('ignore')

### 2. Instantiate the models and check the details

In [2]:
model_SVM = LinearSVC()
model_RF = RandomForestClassifier()
model_GBM = GradientBoostingClassifier()

In [3]:
model_SVM

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [4]:
model_RF

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [5]:
model_GBM

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

### 3. Read in the data file

In [6]:
X=pd.read_csv('chat_bigram_feature.csv').values
y=pd.read_csv('chat_label.csv').values.ravel()

### 4. Create the training and validation sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

### 5. Train the models and generate predicted labels

In [8]:
# train the models with training set
model_SVM.fit(X_train,y_train)
model_RF.fit(X_train,y_train)
model_GBM.fit(X_train,y_train)

# -- get the predicted labels on the test dataset.
y_pred_SVM = model_SVM.predict(X_test)
y_pred_RF = model_RF.predict(X_test)
y_pred_GBM = model_GBM.predict(X_test)

### 6. Check the predictive performance

In [9]:
# calculate the accuracy of the predicted labels from SVM
accuracy_score(y_pred_SVM, y_test).round(3)

0.692

In [10]:
# calculate the accuracy of the predicted labels from Random Forest
accuracy_score(y_pred_RF, y_test).round(3)

0.644

In [11]:
# calculate the accuracy of the predicted labels from Gradient Boosting Machine
accuracy_score(y_pred_GBM, y_test).round(3)

0.642

In [12]:
pd.value_counts(y)/pd.value_counts(y).sum()

3    0.300999
1    0.271304
2    0.256546
4    0.171151
dtype: float64

***