### Import

In [0]:
import pandas as pd
import numpy as np

### Data

In [2]:
data = pd.read_excel('dataset.xlsx')
data.head()

Unnamed: 0,Date Of Test,Signal (dBm),Speed (m/s),Distance from site (m),Call Test Duration (s),Call Test Result,Call Test Technology,Call Test Setup Time (s),MOS
0,2017-07-01 00:00:27,-61.0,68.800003,1048.6,90.0,SUCCESS,UMTS,0.56,2.1
1,2017-07-01 00:02:57,-61.0,68.769997,1855.54,90.0,SUCCESS,UMTS,0.45,3.2
2,2017-07-01 00:05:29,-71.0,69.169998,1685.62,90.0,SUCCESS,UMTS,0.51,2.1
3,2017-07-01 00:08:02,-65.0,69.279999,1770.92,90.0,SUCCESS,UMTS,0.0,1.0
4,2017-07-01 00:10:30,-103.0,0.82,256.07,60.0,SUCCESS,UMTS,3.35,3.6


In [6]:
data['MOS'].unique()

array([2.1, 3.2, 1. , 3.6, 4.4, 3.7, 4.3, 3.8, 1.7, 2. , 3.3, 2.3, 2.7,
       3.1, 2.2, 1.4, 1.2, 4. , 2.4, 4.1, 1.3, 1.8, 2.5, 3.4, 4.2, 3. ,
       2.6, 3.5, 1.6, 2.9, 2.8, 1.5, 1.9, 3.9, 1.1])

In [63]:
data.isnull().any()

Date Of Test                False
Signal (dBm)                 True
Speed (m/s)                 False
Distance from site (m)       True
Call Test Duration (s)      False
Call Test Result            False
Call Test Technology        False
Call Test Setup Time (s)    False
MOS                         False
dtype: bool

In [0]:
# Take 10k random samples from the dataset

data2 = data.sample(n=10000, random_state=1)

### Split x and y

In [65]:
x_features = data2.iloc[:,1:-1]
x_features

Unnamed: 0,Signal (dBm),Speed (m/s),Distance from site (m),Call Test Duration (s),Call Test Result,Call Test Technology,Call Test Setup Time (s)
70677,-105.0,4.290000,582.63,60.0,SUCCESS,LTE,1.26
42450,-81.0,0.000000,512.91,90.0,SUCCESS,UMTS,4.29
59832,-71.0,0.000000,341.47,60.0,SUCCESS,UMTS,4.12
40660,-55.0,0.514004,42.24,90.0,SUCCESS,UMTS,3.17
47888,-81.0,-1.000000,497.16,90.0,SUCCESS,UMTS,3.75
...,...,...,...,...,...,...,...
79353,-61.0,1.126144,549.26,90.0,SUCCESS,UMTS,3.89
65782,-71.0,0.000000,424.23,90.0,SUCCESS,UMTS,3.39
89563,-65.0,0.000000,634.23,90.0,SUCCESS,UMTS,3.42
22529,-75.0,0.000000,482.38,900.0,SUCCESS,UMTS,3.61


In [66]:
y = data2['MOS']
y

70677    4.3
42450    2.1
59832    4.4
40660    2.1
47888    2.1
        ... 
79353    4.4
65782    2.7
89563    2.1
22529    2.0
15566    3.4
Name: MOS, Length: 10000, dtype: float64

### Fill null values

In [67]:
x_features.isnull().any()

Signal (dBm)                 True
Speed (m/s)                 False
Distance from site (m)       True
Call Test Duration (s)      False
Call Test Result            False
Call Test Technology        False
Call Test Setup Time (s)    False
dtype: bool

In [0]:
# Fill null values in Signal
x_features['Signal (dBm)'].describe()

x_features['Signal (dBm)'].fillna(np.mean(x_features['Signal (dBm)']), inplace=True)

In [0]:
# Same for distance
x_features['Distance from site (m)'].describe()

x_features['Distance from site (m)'].fillna(np.mean(x_features['Distance from site (m)']), inplace=True)

### Call Test Results

In [70]:
x_features['Call Test Result'].unique()

array(['SUCCESS', 'FAILURE - DROP CALL', 'FAILURE - SETUP FAIL'],
      dtype=object)

In [0]:
x_features['Call Test Result'].replace('SUCCESS', 2, inplace=True)
x_features['Call Test Result'].replace('FAILURE - DROP CALL', 1, inplace=True)
x_features['Call Test Result'].replace('FAILURE - SETUP FAIL', 0, inplace=True)

In [72]:
x_features['Call Test Result'].unique()

array([2, 1, 0])

### Call Test Tech.

In [73]:
x_features['Call Test Technology'].unique()

array(['LTE', 'UMTS', 'GSM'], dtype=object)

In [0]:
x_features['Call Test Technology'].replace('LTE', 2, inplace=True)
x_features['Call Test Technology'].replace('UMTS', 1, inplace=True)
x_features['Call Test Technology'].replace('GSM', 0, inplace=True)

In [75]:
x_features['Call Test Technology'].unique()

array([2, 1, 0])

### Y

In [76]:
y.unique()

array([4.3, 2.1, 4.4, 2.7, 3.6, 4.2, 1.3, 1.6, 1.7, 1. , 2.2, 2.3, 3.1,
       3.2, 1.8, 1.5, 2.9, 3.3, 1.2, 3.4, 2.6, 3.8, 3. , 1.4, 3.7, 3.9,
       2.4, 2.8, 1.9, 2. , 4. , 2.5, 4.1, 1.1, 3.5])

In [0]:
y2 = y.astype('str')

### TTS

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y2, random_state=1, test_size=.2)

### Random Forest Classifier

In [0]:
from sklearn.ensemble import RandomForestClassifier as rfc

In [81]:
x_train

Unnamed: 0,Signal (dBm),Speed (m/s),Distance from site (m),Call Test Duration (s),Call Test Result,Call Test Technology,Call Test Setup Time (s)
46189,-71.0,6.500000,230.680000,60.0,2,1,0.00
95506,-104.0,-1.000000,7142.691922,90.0,2,2,0.83
85257,-81.0,0.200000,187.580000,90.0,2,2,0.80
204,-106.0,0.000000,828.650000,90.0,2,2,0.47
29089,-69.0,0.000000,228.880000,90.0,2,1,3.07
...,...,...,...,...,...,...,...
8943,-94.0,0.000000,7142.691922,90.0,2,2,0.51
76805,-93.0,0.500000,7142.691922,90.0,2,2,0.63
9141,-75.0,1.160000,556.440000,60.0,2,1,4.02
104295,-53.0,21.730057,3639.820000,90.0,2,1,4.70


In [100]:
classifier = rfc(max_depth=11)

classifier.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=11, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [101]:
classifier.score(x_test, y_test)

0.472

In [98]:
accuracy_list = []

for loop in range(5,21):

  classifier = rfc(max_depth=loop)

  classifier.fit(x_train, y_train)
  accuracy_list.append((classifier.score(x_test, y_test), loop))



In [99]:
accuracy_list

[(0.4605, 5),
 (0.4575, 6),
 (0.4605, 7),
 (0.469, 8),
 (0.48, 9),
 (0.4835, 10),
 (0.4855, 11),
 (0.4635, 12),
 (0.4755, 13),
 (0.4715, 14),
 (0.4605, 15),
 (0.4555, 16),
 (0.4635, 17),
 (0.46, 18),
 (0.454, 19),
 (0.4435, 20)]

In [102]:
x_features

Unnamed: 0,Signal (dBm),Speed (m/s),Distance from site (m),Call Test Duration (s),Call Test Result,Call Test Technology,Call Test Setup Time (s)
70677,-105.0,4.290000,582.63,60.0,2,2,1.26
42450,-81.0,0.000000,512.91,90.0,2,1,4.29
59832,-71.0,0.000000,341.47,60.0,2,1,4.12
40660,-55.0,0.514004,42.24,90.0,2,1,3.17
47888,-81.0,-1.000000,497.16,90.0,2,1,3.75
...,...,...,...,...,...,...,...
79353,-61.0,1.126144,549.26,90.0,2,1,3.89
65782,-71.0,0.000000,424.23,90.0,2,1,3.39
89563,-65.0,0.000000,634.23,90.0,2,1,3.42
22529,-75.0,0.000000,482.38,900.0,2,1,3.61


In [0]:
array_ = [-100,4,500,60,2,2,1]
datafr = pd.DataFrame(np.array([[-100,4,500,60,2,2,1]]), columns=x_features.columns)

In [147]:
classifier.predict(datafr)

array(['4.4'], dtype=object)