In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
df_train = pd.read_csv('data_semantics_training.csv')
df_train.head()

Unnamed: 0,serial_no,C1,C2,C3,C4,C5,C6,C7,C8
0,1,1,1,15200,80,6.5,24.0,Healthy,Live
1,2,2,2,26100,90,6.75,30.0,Healthy,Closed
2,3,3,3,22400,80,7.5,36.0,Defaulted,
3,4,4,4,21600,90,7.5,42.0,Defaulted,
4,5,5,5,44000,100,10.0,60.0,Healthy,Live


In [3]:
df_train.columns

Index(['serial_no', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8'], dtype='object')

In [4]:
## Getting all null columns
df_train_null = df_train[df_train['C5'].isnull() & df_train['C6'].isnull() & df_train['C8'].isnull()]

In [5]:
## Getting not null columns
df_train_notnull = df_train[df_train['C5'].notnull() & df_train['C6'].notnull() & df_train['C8'].notnull()]

In [6]:
df_train_notnull.head()

Unnamed: 0,serial_no,C1,C2,C3,C4,C5,C6,C7,C8
0,1,1,1,15200,80,6.5,24.0,Healthy,Live
1,2,2,2,26100,90,6.75,30.0,Healthy,Closed
4,5,5,5,44000,100,10.0,60.0,Healthy,Live
5,6,6,6,11200,40,4.25,12.0,Healthy,Closed
6,7,7,7,16800,80,5.5,18.0,Healthy,Closed


In [7]:
df_train_notnull.isna().sum()

serial_no    0
C1           0
C2           0
C3           0
C4           0
C5           0
C6           0
C7           0
C8           0
dtype: int64

In [8]:
df_train_x = df_train_notnull.loc[:,['C3','C4','C5','C6','C8']]
df_train_x.head()

Unnamed: 0,C3,C4,C5,C6,C8
0,15200,80,6.5,24.0,Live
1,26100,90,6.75,30.0,Closed
4,44000,100,10.0,60.0,Live
5,11200,40,4.25,12.0,Closed
6,16800,80,5.5,18.0,Closed


In [9]:
df_train_y = df_train_notnull.loc[:,['C7']]


In [10]:
df_train_y.head()

Unnamed: 0,C7
0,Healthy
1,Healthy
4,Healthy
5,Healthy
6,Healthy


In [11]:
df_train_y['C7'].value_counts()

Healthy      580
Unhealthy     22
Name: C7, dtype: int64

In [12]:
le = LabelEncoder()


In [13]:
y_train = le.fit_transform(df_train_y['C7'])

In [14]:
y_train.shape,df_train_x.shape

((602,), (602, 5))

In [15]:
df_train_x = pd.get_dummies(df_train_x,columns=['C8'],prefix='C8')


In [16]:
df_train_x.head()

Unnamed: 0,C3,C4,C5,C6,C8_Closed,C8_Live
0,15200,80,6.5,24.0,0,1
1,26100,90,6.75,30.0,1,0
4,44000,100,10.0,60.0,0,1
5,11200,40,4.25,12.0,1,0
6,16800,80,5.5,18.0,1,0


### Feature Scaling

In [None]:
from sklearn import preprocessing

In [None]:
df_train_x['C3'] = preprocessing.scale(df_train_x['C3'],axis=1)


In [None]:
df_train_x['C4'] = preprocessing.scale(df_train_x['C4'],axis=1)

In [None]:
df_train_x['C5'] = preprocessing.scale(df_train_x['C5'],axis=1)

In [None]:
df_train_x['C6'] = preprocessing.scale(df_train_x['C6'],axis=1)

In [17]:
## Training Logistic regression.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import tree

In [18]:
## Train test split
x_train_,x_test_,y_train_,y_test_ = train_test_split(df_train_x,y_train)

In [19]:

#clf = LogisticRegression()

#clf = SVC(C=1,gamma='auto',random_state=42)

clf = tree.DecisionTreeClassifier()

clf.fit(x_train_,y_train_)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [50]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train_,y_train_)
Y_pred = random_forest.predict(x_test_)
random_forest.score(x_train_, y_train_)
acc_random_forest = round(random_forest.score(x_train_, y_train_) * 100, 2)
acc_random_forest

99.33

In [51]:
y_predict = clf.predict(x_test_)

In [52]:
y_predict = np.array(y_predict)
y_test_ = np.array(y_test_)

In [22]:
y_predict.shape,y_test_.shape

((151,), (151,))

In [23]:
from sklearn.metrics import accuracy_score

accuracy_score(y_predict,y_test_)

0.9668874172185431

## Working on Test data

In [24]:
df_test = pd.read_csv('data_sematics_test.csv')

In [25]:
df_test.head()

Unnamed: 0,serial_no,C1,C2,C3,C4,C5,C6,C8
0,1,700,700,16800,80,6.5,24.0,Closed
1,2,701,701,0,0,,,
2,3,702,702,15200,80,6.5,18.0,Closed
3,4,703,703,14500,50,4.75,18.0,Closed
4,5,704,704,16800,80,7.75,24.0,Closed


In [26]:
df_test_x = df_test.loc[:,['C3','C4','C5','C6','C8']]

In [27]:
df_test_x.head(20)

Unnamed: 0,C3,C4,C5,C6,C8
0,16800,80,6.5,24.0,Closed
1,0,0,,,
2,15200,80,6.5,18.0,Closed
3,14500,50,4.75,18.0,Closed
4,16800,80,7.75,24.0,Closed
5,57600,90,11.75,30.0,Live
6,22400,80,7.25,18.0,Live
7,15200,80,6.5,18.0,Closed
8,16800,80,6.75,18.0,Closed
9,25200,90,7.5,18.0,


In [28]:
## Getting not null columns
df_test_notnull = df_test_x[df_test_x['C5'].notnull() & df_test_x['C6'].notnull() & df_test_x['C8'].notnull()]

In [29]:
## Getting null columns
df_test_isnull = df_test_x[df_test_x['C5'].isnull() & df_test_x['C6'].isnull() & df_test_x['C8'].isnull()]

In [30]:
df_test_x.shape,df_test_notnull.shape,df_test_isnull.shape

((301, 5), (248, 5), (7, 5))

In [31]:
df_test_notnull = pd.get_dummies(df_test_notnull,columns=['C8'],prefix='C8')

In [32]:
df_test_notnull.head()

Unnamed: 0,C3,C4,C5,C6,C8_Closed,C8_Live
0,16800,80,6.5,24.0,1,0
2,15200,80,6.5,18.0,1,0
3,14500,50,4.75,18.0,1,0
4,16800,80,7.75,24.0,1,0
5,57600,90,11.75,30.0,0,1


In [53]:
y_test = random_forest.predict(df_test_notnull)

In [54]:
y_test

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [55]:
y_test = le.inverse_transform(y_test)

In [56]:
y_test

array(['Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Unhealthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy'

In [57]:
#y_test.unique
np.unique(y_test)

array(['Healthy', 'Unhealthy'], dtype=object)

In [58]:
df_test_x.loc[0,['C5']].isnull()[0]

False

In [59]:
df_test_x['C7'] = pd.Series()

In [60]:
t=0
for i in range(len(df_test_x)):
    if df_test_x.loc[i,['C5']].notnull()[0] & df_test_x.loc[i,['C6']].notnull()[0] & df_test_x.loc[i,['C8']].notnull()[0]:
        df_test_x.loc[i,['C7']]=y_test[t]
        t+=1

df_test_x.head()

Unnamed: 0,C3,C4,C5,C6,C8,C7
0,16800,80,6.5,24.0,Closed,Healthy
1,0,0,,,,
2,15200,80,6.5,18.0,Closed,Healthy
3,14500,50,4.75,18.0,Closed,Healthy
4,16800,80,7.75,24.0,Closed,Healthy


In [61]:
for i in range(len(df_test_x)):
    if df_test_x.loc[i,['C5']].isnull()[0] & df_test_x.loc[i,['C6']].isnull()[0] & df_test_x.loc[i,['C8']].isnull()[0]:
        df_test_x.loc[i,['C7']]='None'

df_test_x.head()

Unnamed: 0,C3,C4,C5,C6,C8,C7
0,16800,80,6.5,24.0,Closed,Healthy
1,0,0,,,,
2,15200,80,6.5,18.0,Closed,Healthy
3,14500,50,4.75,18.0,Closed,Healthy
4,16800,80,7.75,24.0,Closed,Healthy


In [62]:
df_test.shape,df_test_x.shape

((301, 8), (301, 6))

In [63]:
df_sub = pd.DataFrame(index=None,columns=['serial_no','C7'])
df_sub['serial_no'] = df_test['serial_no']
df_sub['C7'] = df_test_x['C7']
df_sub.head()

Unnamed: 0,serial_no,C7
0,1,Healthy
1,2,
2,3,Healthy
3,4,Healthy
4,5,Healthy


In [64]:
df_sub.head(20)

Unnamed: 0,serial_no,C7
0,1,Healthy
1,2,
2,3,Healthy
3,4,Healthy
4,5,Healthy
5,6,Healthy
6,7,Unhealthy
7,8,Healthy
8,9,Healthy
9,10,


In [65]:
df_sub.fillna('Defaulted',inplace=True)

In [66]:
df_sub.to_csv('submission.csv',index=False)

## Started working on training data.

In [None]:
df_test = pd.read_csv('data_semantics_training.csv')

In [None]:
df_train_x.corr()

In [None]:
df_train['C8'].value_counts()

In [None]:
df_train['C7'].value_counts()

In [None]:
x_train = df_train.loc[:,['C3','C4','C5','C6','C8']]
x_train.head()

In [None]:
y_train = df_train.loc[:,['C7']]
y_train.head()

In [None]:
## Fill missing value in C8
df_train['C8'].fillna('Closed', inplace=True)

In [None]:
x_train.isna().sum()

In [None]:
x_train.describe()

In [None]:
print(x_train['C5'].mean(),x_train['C5'].median(),x_train['C5'].mode())

In [None]:
x_train['C5'].fillna(x_train['C5'].median(),inplace=True)

In [None]:
plt.hist(x_train['C5'])

In [None]:
plt.hist(x_train['C6'])

In [None]:
plt.hist(x_train['C3'])

### Doing feature Scaling

In [None]:
x_train['C3'].std()