In [1]:
import pandas as pd
import numpy as np


# Reading Data

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

## Mapping columns

In [3]:
varlist=['Male','Married','BankCustomer','EducationLevel','Ethnicity','Citizen','PriorDefault','Employed','DriversLicense','Approved']


def binary_map(x):
    return x.map({'t': 1,'a':1 ,'b':0,'f': 0,'u':0,'y':1,'l':2,'t':3,'g':0,'p':1,'gg':2,'c':0,'d':1,'cc':2,'i':3,'j':4,'k':5,'m':6,'r':7,'q':8,'w':9,'x':10,'e':11,'aa':12,'ff':13,'v':0,'h':1,'bb':2,'n':3,'z':5,'dd':6,'ff':7,'o':8,'s':3,'+':1,'-':0})


train[varlist] = train[varlist].apply(binary_map)


In [4]:
train.isnull().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 17 columns):
Key               590 non-null bool
Male              590 non-null bool
Age               590 non-null bool
Debt              590 non-null bool
Married           590 non-null bool
BankCustomer      590 non-null bool
EducationLevel    590 non-null bool
Ethnicity         590 non-null bool
YearsEmployed     590 non-null bool
PriorDefault      590 non-null bool
Employed          590 non-null bool
CreditScore       590 non-null bool
DriversLicense    590 non-null bool
Citizen           590 non-null bool
ZipCode           590 non-null bool
Income            590 non-null bool
Approved          590 non-null bool
dtypes: bool(17)
memory usage: 9.9 KB


## Checking Null Values

In [5]:
train.isnull().sum()

Key                0
Male              11
Age                0
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode            0
Income             0
Approved           0
dtype: int64

### Drop Nan

In [6]:
train=train.dropna()

In [7]:
train.isnull().sum()

Key               0
Male              0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
Approved          0
dtype: int64

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 572 entries, 8 to 588
Data columns (total 17 columns):
Key               572 non-null int64
Male              572 non-null float64
Age               572 non-null object
Debt              572 non-null float64
Married           572 non-null float64
BankCustomer      572 non-null float64
EducationLevel    572 non-null float64
Ethnicity         572 non-null float64
YearsEmployed     572 non-null float64
PriorDefault      572 non-null int64
Employed          572 non-null int64
CreditScore       572 non-null int64
DriversLicense    572 non-null int64
Citizen           572 non-null int64
ZipCode           572 non-null object
Income            572 non-null int64
Approved          572 non-null int64
dtypes: float64(7), int64(8), object(2)
memory usage: 80.4+ KB


In [9]:
train[train['Age']=='?'].count()

Key               12
Male              12
Age               12
Debt              12
Married           12
BankCustomer      12
EducationLevel    12
Ethnicity         12
YearsEmployed     12
PriorDefault      12
Employed          12
CreditScore       12
DriversLicense    12
Citizen           12
ZipCode           12
Income            12
Approved          12
dtype: int64

In [10]:
#age has only ? so we can replace it with mean

In [11]:
train['Age']=train['Age'].replace('?',0)

In [12]:
train[train['Age']=='?'].count()

Key               0
Male              0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
Approved          0
dtype: int64

In [13]:
train['Age']=train['Age'].astype(float)

In [14]:
c=train['Age'].mean()

In [15]:
train['Age']=train['Age'].replace(0,c)

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 572 entries, 8 to 588
Data columns (total 17 columns):
Key               572 non-null int64
Male              572 non-null float64
Age               572 non-null float64
Debt              572 non-null float64
Married           572 non-null float64
BankCustomer      572 non-null float64
EducationLevel    572 non-null float64
Ethnicity         572 non-null float64
YearsEmployed     572 non-null float64
PriorDefault      572 non-null int64
Employed          572 non-null int64
CreditScore       572 non-null int64
DriversLicense    572 non-null int64
Citizen           572 non-null int64
ZipCode           572 non-null object
Income            572 non-null int64
Approved          572 non-null int64
dtypes: float64(8), int64(8), object(1)
memory usage: 80.4+ KB


# Droping Useless columns

In [17]:
X_train=train.drop(['Key','Approved','ZipCode'],1)

In [18]:
X_train.head(10)

Unnamed: 0,Male,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,Income
8,1.0,38.58,5.0,0.0,0.0,2.0,0.0,13.5,3,0,0,3,0,0
9,1.0,19.17,0.585,1.0,1.0,12.0,0.0,0.585,3,0,0,3,0,0
10,1.0,27.67,1.5,0.0,0.0,6.0,0.0,2.0,3,0,0,0,3,0
11,1.0,30.473549,3.5,0.0,0.0,1.0,0.0,3.0,3,0,0,3,0,0
12,1.0,49.0,1.5,0.0,0.0,4.0,4.0,0.0,3,0,0,3,0,27
13,1.0,28.58,3.54,0.0,0.0,3.0,2.0,0.5,3,0,0,3,0,0
14,1.0,22.5,11.0,1.0,1.0,8.0,0.0,3.0,3,0,0,3,0,0
15,1.0,28.5,1.0,0.0,0.0,8.0,0.0,1.0,3,3,2,3,0,500
16,1.0,25.0,11.0,1.0,1.0,12.0,0.0,4.5,3,0,0,0,0,0
17,1.0,19.75,0.75,0.0,0.0,0.0,0.0,0.795,3,3,5,3,0,5


In [19]:
y_train=train['Approved']
y_train[y_train=='-']=0
y_train[y_train=='+']=1


  result = method(y)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
y_train=y_train.astype(int)

In [21]:
y_train.unique()

array([0, 1])

In [22]:
test.head()

Unnamed: 0,Key,Male,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income
0,1,b,31.83,0.04,y,p,m,v,0.04,f,f,0,f,g,0,0
1,2,a,21.75,11.75,u,g,c,v,0.25,f,f,0,t,g,180,0
2,3,a,17.92,0.54,u,g,c,v,1.75,f,t,1,t,g,80,5
3,4,b,30.33,0.5,u,g,d,h,0.085,f,f,0,t,s,252,0
4,5,b,51.83,2.04,y,p,ff,ff,1.5,f,f,0,f,g,120,1


In [23]:
varlist=['Male','Married','BankCustomer','EducationLevel','Ethnicity','Citizen','PriorDefault','Employed','DriversLicense']

def binary_map(x):
    return x.map({'t': 1,'a':1 ,'b':0,'f': 0,'u':0,'y':1,'l':2,'t':3,'g':0,'p':1,'gg':2,'c':0,'d':1,'cc':2,'i':3,'j':4,'k':5,'m':6,'r':7,'q':8,'w':9,'x':10,'e':11,'aa':12,'ff':13,'v':0,'h':1,'bb':2,'n':3,'z':5,'dd':6,'ff':7,'o':8,'s':3,'+':1,'-':0})


test[varlist] = test[varlist].apply(binary_map)


In [24]:
test.isnull().sum()

Key               0
Male              1
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
dtype: int64

In [25]:
test=test.fillna(0)

In [26]:
for i in test.columns:
    print(test[i].unique())

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100]
[0. 1.]
[31.83 21.75 17.92 30.33 51.83 47.17 25.83 50.25 29.5  37.33 41.58 30.58
 19.42 20.08 19.5  27.83 17.08 36.42 40.58 21.08 22.67 25.25 35.   30.83
 58.67 24.5  20.17 32.08 33.17 22.92 54.42 42.5  22.08 29.92 38.25 48.08
 45.83 36.67 28.25 23.25 21.83 19.17 25.   47.75 27.42 41.17 15.83 47.
 56.58 57.42 42.08 29.25 42.   49.5  36.75 22.58 27.25 23.   27.75 54.58
 34.17 28.92 29.67 39.58 56.42 54.33 41.   31.92 41.5  23.92 25.75 26.
 37.42 34.92 34.25 23.33 23.17 44.33 35.17 43.25 56.75 31.67 23.42 20.42
 26.67 36.   25.5  52.5  57.83 20.75 39.92 25.67]
[ 0.04  11.75   0.54   0.5  

In [27]:
X_test=test.drop(['Key','ZipCode'],1)

In [28]:
from sklearn.linear_model import LogisticRegression
rog=LogisticRegression()

In [29]:
rog.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
y_pred=rog.predict(X_test)

In [31]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [32]:
y_pred=y_pred.astype(str)
y_pred[y_pred=='1']='+'
y_pred[y_pred=='0']='-'


In [33]:
y_pred

array(['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '+',
       '+', '+', '+', '+', '+', '+', '+', '-', '+', '-', '+', '-', '-',
       '+', '+', '+', '+', '-', '+', '+', '+', '+', '+', '+', '+', '+',
       '+', '+', '+', '+', '+', '+', '+', '+', '+', '+', '+', '+', '+',
       '+', '+', '+', '+', '+', '+', '+', '+', '-', '-', '-', '+', '+',
       '-', '+', '-', '+', '+', '-', '+', '+', '+', '+', '+', '+', '+',
       '+', '+', '+', '+', '+', '+', '+', '+', '+'], dtype='<U21')

In [34]:
key=test['Key']
key.

SyntaxError: invalid syntax (<ipython-input-34-fdd89293573c>, line 2)

In [236]:
c=pd.Series(y_pred)
d=pd.Series(key)
# c[c==0]='+'

In [237]:
c[c==0]='-'
c[c==1]='+'
c

0     -
1     -
2     -
3     -
4     -
5     -
6     -
7     -
8     -
9     -
10    -
11    -
12    -
13    -
14    -
15    -
16    -
17    -
18    -
19    -
20    -
21    -
22    -
23    -
24    -
25    +
26    +
27    +
28    +
29    +
     ..
70    +
71    +
72    +
73    -
74    -
75    -
76    +
77    +
78    -
79    +
80    -
81    +
82    +
83    -
84    +
85    +
86    +
87    +
88    +
89    +
90    +
91    +
92    +
93    +
94    +
95    +
96    +
97    +
98    +
99    +
Length: 100, dtype: object

In [238]:
prediction = pd.DataFrame(c, columns=['predictions']).to_csv('prediction.csv')