In [34]:
# Importing the required libraries
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [35]:
# List the CSV files in the dataset directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# the load order (and with it the row order of the combined frame and the seeded
# train/test split) reproducible. Non-CSV entries such as hidden files or
# checkpoint folders are skipped so they are never handed to pd.read_csv.
FILE_PATH = "./dataset"
FILES = sorted(
    entry for entry in os.listdir(FILE_PATH) if entry.lower().endswith(".csv")
)
print("Dataset files:", FILES)

Dataset files: ['0.csv', '1.csv', '2.csv', '3.csv']


In [36]:
# Load each CSV into its own DataFrame.
# header=None makes pandas number the columns positionally instead of consuming
# the first data row as column names.
data_frames = [
    pd.read_csv(os.path.join(FILE_PATH, file_name), header=None)
    for file_name in FILES
]
print(len(data_frames))

4


In [37]:
# Preview the first 2 rows of every per-file frame.
# The loop variables are deliberately not named `df`: notebook loop variables stay
# in the global namespace after the loop, and `df` is the name used for the
# combined frame.
for frame_idx, frame in enumerate(data_frames):
    print(f"Data frame {frame_idx}")
    print(frame.head(2))

Data frame 0
     0    1    2    3     4     5      6     7     8    9   ...    55    56  \
0  26.0  4.0  5.0  8.0  -1.0 -13.0 -109.0 -66.0  -9.0  2.0  ... -28.0  61.0   
1 -47.0 -6.0 -5.0 -7.0  13.0  -1.0   35.0 -10.0  10.0 -4.0  ... -25.0  47.0   

    57   58   59    60    61     62    63  64  
0  4.0  8.0  5.0   4.0  -7.0  -59.0  16.0   0  
1  6.0  6.0  5.0  13.0  21.0  111.0  15.0   0  

[2 rows x 65 columns]
Data frame 1
    0    1    2    3     4     5    6    7     8    9   ...   55    56   57  \
0 -7.0 -1.0 -1.0  0.0 -10.0 -10.0 -1.0  1.0  -5.0 -5.0  ...  6.0  -4.0 -3.0   
1 -6.0 -2.0 -5.0 -2.0  27.0  42.0  3.0  5.0  11.0  1.0  ...  2.0 -17.0 -5.0   

    58   59    60    61   62   63  64  
0 -5.0 -3.0  15.0  11.0 -4.0 -5.0   1  
1 -7.0 -2.0  15.0  12.0  0.0 -7.0   1  

[2 rows x 65 columns]
Data frame 2
    0     1     2    3     4     5    6     7    8     9   ...   55    56  \
0  4.0  19.0  -9.0 -7.0  -3.0 -36.0 -6.0 -23.0  3.0 -21.0  ...  9.0 -14.0   
1 -1.0  12.0  20.0  7

In [38]:
# Stack the per-file frames into one combined dataframe.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every source
# file contributes its own 0-based labels, so index labels would repeat.
df = pd.concat(data_frames, ignore_index=True)
print(df.head(2))

     0    1    2    3     4     5      6     7     8    9   ...    55    56  \
0  26.0  4.0  5.0  8.0  -1.0 -13.0 -109.0 -66.0  -9.0  2.0  ... -28.0  61.0   
1 -47.0 -6.0 -5.0 -7.0  13.0  -1.0   35.0 -10.0  10.0 -4.0  ... -25.0  47.0   

    57   58   59    60    61     62    63  64  
0  4.0  8.0  5.0   4.0  -7.0  -59.0  16.0   0  
1  6.0  6.0  5.0  13.0  21.0  111.0  15.0   0  

[2 rows x 65 columns]


In [39]:
# Split the combined frame column-wise: the final column is taken as the label,
# everything before it as the feature matrix.
label_col = df.shape[1] - 1
X = df.iloc[:, :label_col].values  # Features
y = df.iloc[:, label_col].values  # labels

In [40]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [41]:
# Feature scaling: the scaler is fitted on the training rows only, then that same
# fitted transform is applied to both the training and the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [42]:
# Inspect the scaled training features (NumPy elides the middle of large arrays with "...")
print(X_train)

[[-0.02992967 -0.44025523 -1.05772552 ...  0.64174639  0.3339397
  -0.15716195]
 [ 0.3439229   0.57419475 -0.45305449 ... -0.69006835  0.17635129
  -0.79587394]
 [ 0.18370037  0.23604476  0.95784456 ...  2.20858727  0.49152811
  -0.85974513]
 ...
 [ 2.90748342  0.32058226  1.15940157 ...  0.56340435  1.12188174
   0.48155004]
 [-0.1901522   0.06696976 -0.04994048 ...  3.14869179  0.64911652
   0.22606525]
 [ 0.02347784  0.06696976  0.35317354 ... -0.4158712  -2.18747484
  -1.17910113]]


In [43]:
# Random forest of 20 trees using the entropy split criterion; the fixed
# random_state makes training repeatable. Fitted on the scaled training split.
classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=20,
    random_state=0,
)
classifier.fit(X_train, y_train)

In [44]:
# Predict labels for the held-out rows and show the first five predictions
y_pred = classifier.predict(X_test)
print(y_pred[:5])

[2 2 3 3 1]


In [45]:
# Confusion matrix on the test split (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[689   0   9   8]
 [  0 651  21  37]
 [ 10  13 689  34]
 [ 49  36  52 622]]


In [47]:
# 10-fold cross-validation over the full (unscaled) X / y; report mean and spread
# of the per-fold scores.
accuracies = cross_val_score(classifier, X, y, cv=10)
print(accuracies.mean(), accuracies.std(), sep="\n")

# Fraction of test rows whose predicted label matches the true label
test_hits = y_pred == y_test
print("Test set classification rate: {}".format(np.mean(test_hits)))

0.9054702814851335
0.05796461854381617
Test set classification rate: 0.9078767123287671
