In [18]:
import pandas as pd, numpy as np, random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

### Load Dataset

In [19]:
# Read the training and test splits from CSV files in the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [20]:
# Preview the first five training rows (5 is the default for head()).
train_df.head(n=5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [21]:
# Encode Gender as a 0/1 indicator column on both frames.
# Each frame must be encoded from its OWN Gender column: deriving the test
# indicator from train_df would copy the training rows' gender onto the test
# rows by index alignment and feed the model a meaningless feature.
train_df['isMale'] = train_df['Gender'].eq('Male').astype(int)
test_df['isMale'] = test_df['Gender'].eq('Male').astype(int)

In [22]:
# One-hot encode Geography on both frames.
# get_dummies only creates columns for the categories present in the frame it
# is given, so the test encoding is re-aligned to the training columns:
# a category absent from test.csv becomes an all-False column (instead of a
# missing column that would break feature selection later), and a category
# seen only in test.csv is dropped because the model was never trained on it.
geo_train = pd.get_dummies(train_df['Geography'], prefix='Geo')
train_df = train_df.join(geo_train)
one_hot_geo = pd.get_dummies(test_df['Geography'], prefix='Geo').reindex(
    columns=geo_train.columns, fill_value=False
)
test_df = test_df.join(one_hot_geo)

In [23]:
# Frequency of each CreditScore value, counting missing values as well.
train_df.loc[:, "CreditScore"].value_counts(dropna=False)

CreditScore
850    2532
678    2299
684    1718
667    1658
705    1605
       ... 
419       1
386       1
358       1
423       1
373       1
Name: count, Length: 457, dtype: int64

## Logistic Regression

In [24]:
# Show the dtype of every training column (including the engineered isMale and
# Geo_* columns) before choosing the model inputs.
train_df.dtypes

id                   int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
isMale               int32
Geo_France            bool
Geo_Germany           bool
Geo_Spain             bool
dtype: object

In [25]:
# Names of the columns used as model inputs, in the order they are passed to
# the classifier.
estimators = [
    # columns read directly from the CSV
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
    # engineered indicator columns
    'isMale',
    'Geo_France',
    'Geo_Germany',
    'Geo_Spain',
]

# Split the training frame into the target column and the feature matrix.
Y = train_df.loc[:, 'Exited']
X = train_df.loc[:, estimators]

In [26]:
# Fit a logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so the call can be chained; the bare
# `model` on the last line keeps the fitted estimator as the cell's output.
model = LogisticRegression().fit(X, Y)
model

## Prediction

In [31]:
# Select the same feature columns, in the same order, from the test frame.
X_test = test_df.loc[:, estimators]

In [43]:
# predict_proba returns one column per class; column 1 is taken as the
# probability of Exited == 1 (assumes model.classes_ is [0, 1] — TODO confirm).
y_pred = model.predict_proba(X_test)
prob_exited = y_pred[:, 1]
# Wrap the probabilities in a single-column frame named 'Exited'.
pred = pd.DataFrame({'Exited': prob_exited})

In [44]:
# Place the test ids next to the predicted probabilities (aligned on index).
result_df = pd.concat(objs=[test_df['id'], pred], axis='columns')

In [45]:
# Write the id / Exited predictions to disk without the index column.
result_df.to_csv(path_or_buf='output.csv', index=False)

In [46]:
# Constant-0.5 baseline file with the same layout as the model predictions.
# assign() returns a new frame, so result_df keeps the model's probabilities;
# a plain `unif_df = result_df` would only alias the frame, and setting
# 'Exited' on it would silently overwrite the predictions in result_df too.
unif_df = result_df.assign(Exited=0.5)
unif_df.to_csv('unif.csv', index=False)