In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, tree, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
# Importing the input and target data sets
inputs = pd.read_csv('DataminingContest2009.Task2Inputs.Train.csv')
target = pd.read_csv('DataminingContest2009.Task2Targets.Train.csv')

# Concatenating the zip and state columns to reduce model bias
inputs["zipstate"] = inputs["zip1"].map(str) + inputs["state1"]

# Subset the input variables to only the most important
inputs = inputs[['field3', 'flag5', 'field4', 'zipstate']]

In [3]:
print(inputs.info())
print('-------------')
print(target.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
field3      100000 non-null int64
flag5       100000 non-null int64
field4      100000 non-null int64
zipstate    99999 non-null object
dtypes: int64(3), object(1)
memory usage: 3.1+ MB
None
-------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 1 columns):
fraud    100000 non-null int64
dtypes: int64(1)
memory usage: 781.3 KB
None


In [4]:
# Inputting the state missing value with the most common state
inputs['zipstate'] = inputs['zipstate'].fillna(inputs['zipstate'].mode()[0])

In [5]:
# Converting zipstate values to integers
lbl = preprocessing.LabelEncoder()

lbl.fit(np.unique(list(inputs['zipstate'].values)))
inputs['zipstate'] = lbl.transform(list(inputs['zipstate'].values))

In [6]:
print(inputs.head())

   field3  flag5  field4  zipstate
0    -723      1      19      1212
1    5497      1      14      1201
2   -4420      1      23       432
3    5010      1      31      1155
4   -4074      1      21       285


In [7]:
# Convert our data frame to multidimensional arrays
# The X variable is an array of the independent variables and drops column 'y'
X = np.array(inputs)
# X = preprocessing.scale(X)

# The y variable is an array of the dependent variable 'y'
y = np.array(target)

In [8]:
# Shuffle and partition our data into 80% train data and 20% test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [9]:
# Choosing which model to use for our data
clf = RandomForestClassifier(n_estimators=500)
# clf = LogisticRegression()
# clf = tree.DecisionTreeClassifier()

# Training the data
clf.fit(X_train, y_train)

# Scoring the model
accuracy = clf.score(X_test, y_test)

# Printing the score of the model
print(accuracy)



0.98404


In [10]:
# Predicting the test data
y_pred = clf.predict(X_test)
y_test.resize(len(y_pred))

# Displaying a confusion matrix
df_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(df_confusion)

Predicted      0    1
Actual               
0          24235  135
1            264  366
