In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from pandas import DataFrame
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Import csv
file_path = 'mock_covid_zipcodes.csv'
mock_zipcodes_df = pd.read_csv(file_path)
mock_zipcodes_df.head(10)

Unnamed: 0,hh,ii,jj,dates,ll,mm,nn
0,1,aaa,4,1/1/1900,9,6,2
1,2,bbb,3,1/2/1900,7,10,7
2,3,ccc,7,1/3/1900,6,8,5
3,4,ddd,9,1/4/1900,4,9,8
4,5,eee,10,1/5/1900,7,7,8
5,6,fff,8,1/6/1900,3,1,3
6,7,ggg,3,1/7/1900,1,5,3
7,8,hhh,4,1/8/1900,4,7,9
8,9,iii,3,1/9/1900,10,10,5
9,10,jjj,4,1/10/1900,10,9,2


In [3]:
file_path = 'mock_original_crimedata.csv'
mock_crimedata_df = pd.read_csv(file_path)
mock_crimedata_df.head(10)

Unnamed: 0,aa,bb,cc,dd,dates,ff,gg
0,1,10,aaa,3,1/1/1900,9,3
1,2,6,bbb,7,1/2/1900,2,1
2,3,3,ccc,9,1/3/1900,9,9
3,4,5,ddd,3,1/4/1900,10,2
4,5,9,eee,7,1/5/1900,8,1
5,6,5,fff,7,1/6/1900,2,3
6,7,7,ggg,10,1/7/1900,8,1
7,8,4,hhh,3,1/8/1900,1,1
8,9,8,iii,5,1/9/1900,2,10
9,10,7,jjj,1,1/10/1900,7,9


In [4]:
# Rename column hh to aa
mock_zipcodes_df.rename(columns = {'hh':'aa'}, inplace = True)
mock_zipcodes_df.head(10)

Unnamed: 0,aa,ii,jj,dates,ll,mm,nn
0,1,aaa,4,1/1/1900,9,6,2
1,2,bbb,3,1/2/1900,7,10,7
2,3,ccc,7,1/3/1900,6,8,5
3,4,ddd,9,1/4/1900,4,9,8
4,5,eee,10,1/5/1900,7,7,8
5,6,fff,8,1/6/1900,3,1,3
6,7,ggg,3,1/7/1900,1,5,3
7,8,hhh,4,1/8/1900,4,7,9
8,9,iii,3,1/9/1900,10,10,5
9,10,jjj,4,1/10/1900,10,9,2


In [5]:
# Merge dataframes
mock_df = pd.concat([mock_zipcodes_df.set_index('aa'),mock_crimedata_df.set_index('aa')], axis=1, join='inner')
mock_df.head(10)

Unnamed: 0_level_0,ii,jj,dates,ll,mm,nn,bb,cc,dd,dates,ff,gg
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,aaa,4,1/1/1900,9,6,2,10,aaa,3,1/1/1900,9,3
2,bbb,3,1/2/1900,7,10,7,6,bbb,7,1/2/1900,2,1
3,ccc,7,1/3/1900,6,8,5,3,ccc,9,1/3/1900,9,9
4,ddd,9,1/4/1900,4,9,8,5,ddd,3,1/4/1900,10,2
5,eee,10,1/5/1900,7,7,8,9,eee,7,1/5/1900,8,1
6,fff,8,1/6/1900,3,1,3,5,fff,7,1/6/1900,2,3
7,ggg,3,1/7/1900,1,5,3,7,ggg,10,1/7/1900,8,1
8,hhh,4,1/8/1900,4,7,9,4,hhh,3,1/8/1900,1,1
9,iii,3,1/9/1900,10,10,5,8,iii,5,1/9/1900,2,10
10,jjj,4,1/10/1900,10,9,2,7,jjj,1,1/10/1900,7,9


In [6]:
# Drop duplicate columns
mock_df=mock_df.T.drop_duplicates().T
mock_df.head(10)

Unnamed: 0_level_0,ii,jj,dates,ll,mm,nn,bb,dd,ff,gg
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,aaa,4,1/1/1900,9,6,2,10,3,9,3
2,bbb,3,1/2/1900,7,10,7,6,7,2,1
3,ccc,7,1/3/1900,6,8,5,3,9,9,9
4,ddd,9,1/4/1900,4,9,8,5,3,10,2
5,eee,10,1/5/1900,7,7,8,9,7,8,1
6,fff,8,1/6/1900,3,1,3,5,7,2,3
7,ggg,3,1/7/1900,1,5,3,7,10,8,1
8,hhh,4,1/8/1900,4,7,9,4,3,1,1
9,iii,3,1/9/1900,10,10,5,8,5,2,10
10,jjj,4,1/10/1900,10,9,2,7,1,7,9


In [7]:
# Change ii column from letters to numeric values
le = LabelEncoder()
mock_df2 = mock_df.copy()
mock_df2['ii'] = le.fit_transform(mock_df2['ii'])

In [8]:
# Print mock_df2
mock_df2.head()

Unnamed: 0_level_0,ii,jj,dates,ll,mm,nn,bb,dd,ff,gg
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,4,1/1/1900,9,6,2,10,3,9,3
2,1,3,1/2/1900,7,10,7,6,7,2,1
3,2,7,1/3/1900,6,8,5,3,9,9,9
4,3,9,1/4/1900,4,9,8,5,3,10,2
5,4,10,1/5/1900,7,7,8,9,7,8,1


In [9]:
# Determine data types
mock_df2.dtypes

ii        int64
jj       object
dates    object
ll       object
mm       object
nn       object
bb       object
dd       object
ff       object
gg       object
dtype: object

In [10]:
mock_df2[['jj', 'll', 'mm', 'nn', 'bb', 'dd', 'ff', 'gg', 'ff']].astype(int)

Unnamed: 0_level_0,jj,ll,mm,nn,bb,dd,ff,gg,ff
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,4,9,6,2,10,3,9,3,9
2,3,7,10,7,6,7,2,1,2
3,7,6,8,5,3,9,9,9,9
4,9,4,9,8,5,3,10,2,10
5,10,7,7,8,9,7,8,1,8
6,8,3,1,3,5,7,2,3,2
7,3,1,5,3,7,10,8,1,8
8,4,4,7,9,4,3,1,1,1
9,3,10,10,5,8,5,2,10,2
10,4,10,9,2,7,1,7,9,7


In [11]:
# Print dataframe
mock_df2

Unnamed: 0_level_0,ii,jj,dates,ll,mm,nn,bb,dd,ff,gg
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,4,1/1/1900,9,6,2,10,3,9,3
2,1,3,1/2/1900,7,10,7,6,7,2,1
3,2,7,1/3/1900,6,8,5,3,9,9,9
4,3,9,1/4/1900,4,9,8,5,3,10,2
5,4,10,1/5/1900,7,7,8,9,7,8,1
6,5,8,1/6/1900,3,1,3,5,7,2,3
7,6,3,1/7/1900,1,5,3,7,10,8,1
8,7,4,1/8/1900,4,7,9,4,3,1,1
9,8,3,1/9/1900,10,10,5,8,5,2,10
10,9,4,1/10/1900,10,9,2,7,1,7,9


In [12]:
# Drop columns thhat are not features or target.
mock_df2 = mock_df2.drop(columns=['jj','dates', 'll', 'mm', 'dd', 'ff'], axis=1)

In [13]:
# Print dataframe
mock_df2

Unnamed: 0_level_0,ii,nn,bb,gg
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,2,10,3
2,1,7,6,1
3,2,5,3,9
4,3,8,5,2
5,4,8,9,1
6,5,3,5,3
7,6,3,7,1
8,7,9,4,1
9,8,5,8,10
10,9,2,7,9


In [14]:
# Define the features set.
X = mock_df2.copy()
X = X.drop("bb", axis=1)
X.head()

Unnamed: 0_level_0,ii,nn,gg
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,2,3
2,1,7,1
3,2,5,9
4,3,8,2
5,4,8,1


In [15]:
# Define target set
y = mock_df2['bb'].values
y=y.astype('int')
y[:5]

array([10,  6,  3,  5,  9])

In [16]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split (X,y, random_state = 75, train_size=0.70)

In [17]:
# Create standard scaler and scale data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Create random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state = 75)

In [19]:
# Fit model.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [20]:
# Make predictions.
predictions = rf_model.predict(X_test_scaled)
print(predictions)

[5 5 5]


In [21]:
# Calculate Confusion Matrix
cm = confusion_matrix(y_test, predictions)

In [22]:
# Calculate the accuracy 
accuracy_score = accuracy_score(y_test, predictions)

In [23]:
# Display results
print("Confusion Matrix")
print(f"Accuracy Score : {accuracy_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix
Accuracy Score : 0.0
Classification Report
              precision    recall  f1-score   support

           4       0.00      0.00      0.00       1.0
           5       0.00      0.00      0.00       0.0
           7       0.00      0.00      0.00       1.0
           9       0.00      0.00      0.00       1.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
