In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
# Importing the input and target data sets
inputs = pd.read_csv('DataminingContest2009.Task2Inputs.Train.csv')
target = pd.read_csv('DataminingContest2009.Task2Targets.Train.csv')

# Dropping the two customer attribute columns as they won't provide value to the model
inputs = inputs.drop(['custAttr1', 'custAttr2'], 1)

In [3]:
print(inputs.info())
print('-------------')
print(target.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 17 columns):
amount        100000 non-null float64
hour1         100000 non-null int64
state1        99999 non-null object
zip1          100000 non-null int64
field1        100000 non-null int64
field2        100000 non-null int64
hour2         100000 non-null int64
flag1         100000 non-null int64
total         100000 non-null float64
field3        100000 non-null int64
field4        100000 non-null int64
indicator1    100000 non-null int64
indicator2    100000 non-null int64
flag2         100000 non-null int64
flag3         100000 non-null int64
flag4         100000 non-null int64
flag5         100000 non-null int64
dtypes: float64(2), int64(14), object(1)
memory usage: 13.0+ MB
None
-------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 1 columns):
fraud    100000 non-null int64
dtypes: int64(1)
memory usage: 781.3 KB
None


In [4]:
# Inputting the state missing value with the most common state
inputs['state1'] = inputs['state1'].fillna(inputs['state1'].mode()[0])

In [5]:
# Converting state values to integers
lbl = preprocessing.LabelEncoder()

lbl.fit(np.unique(list(inputs['state1'].values)))
inputs['state1'] = lbl.transform(list(inputs['state1'].values))

In [6]:
print(inputs.head())

   amount  hour1  state1  zip1  field1  field2  hour2  flag1  total  field3  \
0   12.95      0      50   986       0       0      0      0  12.95    -723   
1   38.85      0      50   980       3       1      0      0  38.85    5497   
2   38.85      0      19   402       2       1      0      0  38.85   -4420   
3   12.95      0       6   958       3       0      0      0  12.95    5010   
4   38.85      0      12   300       3       1      0      0  38.85   -4074   

   field4  indicator1  indicator2  flag2  flag3  flag4  flag5  
0      19           0           0      0      0      0      1  
1      14           1           0      0      1      0      1  
2      23           0           0      1      1      0      1  
3      31           0           0      1      0      0      1  
4      21           0           0      1      0      0      1  


In [7]:
# Convert our data frame to multidimensional arrays
# The X variable is an array of the independent variables and drops column 'y'
X = np.array(inputs)

# The y variable is an array of the dependent variable 'y'
y = np.array(target)

In [8]:
# Shuffle and partition our data into 80% train data and 20% test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
# Choosing which model to use for our data
clf = RandomForestClassifier(n_estimators=50)
# clf = LogisticRegression()
# clf = tree.DecisionTreeClassifier()

# Training the data
clf.fit(X_train, y_train)

# Scoring the model
accuracy = clf.score(X_test, y_test)

# Printing the score of the model
print(accuracy)



0.98155


In [10]:
# Predicting the test data
y_pred = clf.predict(X_test)
y_test.resize(20000)

# Displaying a confusion matrix
df_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(df_confusion)

Predicted      0    1
Actual               
0          19411   60
1            309  220
