# Challenge - Chemicals Segregation

Chemical Segregation(Classification)
A chemist has two chemical flasks labelled 1 and 0 which contains two different chemicals. He extracted 3 features from these chemicals in order to distinguish between them. You are provided with the results derived by the chemist and your task is to create a model that will label chemical 0 or 1 given its three features.


Data Description
You are provided with two files test and train.

Train: This files consists of two csv files LogisticXtrain and LogisticYtrain. Xtrain consists of the features whereas Ytrain consists of the labels associated with the features.

Test: This file consists of two files LogisticXtest consisting of the features of test data and sample_output which represents in which format your solution csv must be submitted.

You need to implement any classifier from scratch, don't use any sklearn based classifier.

In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Classifier from Scratch

In [141]:
# returning hx for each sample hence, a numpy array of size (m X 1)
def hypothesis(X,theta):
    return np.sum(X*theta[1:], axis=1).reshape((-1,1)) + theta[0] # using numpy broadcasting

In [142]:
# retuning sigmoid for each sample hence, a numpy array of size (m X 1)
def sigmoid(X,theta):
    hx = hypothesis(X,theta)
    return 1.0 / (1.0 + np.exp(-1*hx))

In [143]:
# returning scalar value as error
def negative_log_likelihood(X,Y,theta):
    g_h_x = sigmoid(X,theta)    
    log_liklihood = np.sum(Y * np.log2(g_h_x) +  (1-Y) * np.log2(1 - g_h_x) )
    return  -1 * log_liklihood

In [144]:
# return gradients w.r.t theta ,size = (n+1,)
def gradient(X,Y,theta):
    grad = np.zeros(X.shape[1] + 1)
    hx = sigmoid(X,theta)
    grad[0] = np.sum(hx - Y)
    for i in range(1, X.shape[1] + 1):
        mul = X[:,i-1].reshape((-1,1))
        grad[i] = np.sum((hx - Y)*mul)
    
    return grad

In [145]:
# goal of this function is to minimize the ``Negative of log of likelihood`` using graident descent
# code is similar to Linear Regression but hypothesis function is different

def classifier(X,Y,learning_rate=0.0001):
    # Y should have a shape with atleast two axis, otherwise use reshape((-1,1))
    theta = np.zeros(X.shape[1] + 1)
    error = []
    err = negative_log_likelihood(X,Y,theta)
    error.append(err)
    while True:        
#         print(error[-1])
        grad = gradient(X,Y,theta)
        theta = theta - learning_rate * grad
        err = negative_log_likelihood(X,Y,theta)
        if abs(err - error[-1]) < 0.0001:
            break
        error.append(err)
        
    return theta

In [146]:
def predict(X_test, theta):
    import numpy as np
    g_h_x = sigmoid(X_test,theta)
    Y_pred = []
    for i in range(X_test.shape[0]):
        if g_h_x[i] >= 0.5:
            Y_pred.append(1)
        else:            
            Y_pred.append(0)
    return np.array(Y_pred)

    
def accuracy(Y_actual, Y_predict):
    total = Y_actual.shape[0]
    
    diff = np.sum(Y_actual == Y_predict)
    return diff / total

In [147]:
# loading data
X = pd.read_csv('Datasets/Assignment3_Logistic_X_Train.csv')
Y = pd.read_csv('Datasets/Assignment3_Logistic_Y_Train.csv')
test = pd.read_csv('Datasets/Assignment3_Logistic_X_Test.csv')

In [148]:
X.head()

Unnamed: 0,f1,f2,f3
0,-1.239375,0.749101,-0.528515
1,-1.03607,0.801436,-1.283712
2,-0.615579,1.579521,-1.391927
3,1.335978,1.348651,1.433564
4,0.658925,1.300019,0.571603


In [149]:
X.shape

(3000, 3)

In [150]:
Y.shape

(3000, 1)

In [151]:
test.shape

(1000, 3)

In [152]:
# preprocessing
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
s.fit_transform(X)
s.transform(test)

array([[-0.58096727, -1.23196981, -2.10386172],
       [-0.93642077,  0.43931113, -0.36596031],
       [ 1.43773502,  0.91090225,  1.08581064],
       ...,
       [-0.89542577, -0.05073748, -0.67463414],
       [-0.40465638,  0.8263143 , -0.84593545],
       [ 0.39940977, -0.40861529,  0.29533219]])

In [153]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=11)

In [154]:
theta = classifier(X_train.values,Y_train.values)

In [155]:
theta

array([-4.65946819, -4.20626944,  3.45533749, -2.4243403 ])

In [156]:
Y_pred = predict(X_test.values, theta)

In [157]:
Y_pred.shape

(750,)

In [158]:
acc = accuracy(Y_test,Y_pred.reshape((-1,1)))

In [159]:
acc

label    0.996
dtype: float64

In [160]:
# output file 

In [161]:
output = predict(test.values,theta)
output.dtype

dtype('int32')

In [162]:
output.shape

(1000,)

In [163]:
df = pd.DataFrame(output)

In [164]:
df.columns = ['label']

In [165]:
df.head()

Unnamed: 0,label
0,1
1,1
2,0
3,1
4,1


In [166]:
df.to_csv('Datasets/Assignment3_Logistic_scratch_output.csv',index=False)

In [167]:
df = pd.read_csv('Datasets/Assignment3_Logistic_scratch_output.csv')

In [168]:
df

Unnamed: 0,label
0,1
1,1
2,0
3,1
4,1
5,1
6,0
7,1
8,0
9,0


## using SKlearn

In [169]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [170]:
# loading data
X = pd.read_csv('Datasets/Assignment3_Logistic_X_Train.csv')
Y = pd.read_csv('Datasets/Assignment3_Logistic_Y_Train.csv')
test = pd.read_csv('Datasets/Assignment3_Logistic_X_Test.csv')

In [171]:
# preprocessing
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
s.fit_transform(X)
s.transform(test)

array([[-0.58096727, -1.23196981, -2.10386172],
       [-0.93642077,  0.43931113, -0.36596031],
       [ 1.43773502,  0.91090225,  1.08581064],
       ...,
       [-0.89542577, -0.05073748, -0.67463414],
       [-0.40465638,  0.8263143 , -0.84593545],
       [ 0.39940977, -0.40861529,  0.29533219]])

In [172]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

In [173]:
Y_train = np.ravel(Y_train)

In [174]:
Y_train.shape

(2250,)

In [175]:
from sklearn.linear_model import LogisticRegression

In [176]:
model = LogisticRegression(solver='lbfgs')

In [177]:
model.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [178]:
model.coef_

array([[-3.74054432,  3.00706952, -2.20505995]])

In [179]:
model.intercept_

array([-3.94248248])

In [180]:
Y_test  = np.ravel(Y_test)

In [181]:
Y_test.shape

(750,)

In [182]:
Y_pred = model.predict(X_test)

In [183]:
Y_pred.shape

(750,)

In [184]:
output = model.predict(test)

In [185]:
output.shape

(1000,)

In [186]:
output

array([1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,

In [187]:
model.score(X_test,Y_test)

0.9946666666666667

In [188]:
df = pd.DataFrame(output)

In [189]:
df.columns = ['label']

In [190]:
df.to_csv('Datasets/Assignment3_Logistic_output.csv',index=False)

In [191]:
df = pd.read_csv('Datasets/Assignment3_Logistic_output.csv')

In [192]:
df

Unnamed: 0,label
0,1
1,1
2,0
3,1
4,1
5,1
6,0
7,1
8,0
9,0
