In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Base path to your data folder 
base_path = "/Users/machome/Downloads/archive (6)/"

# Load datasets 
train = pd.read_csv(base_path + "train.csv") 
test = pd.read_csv(base_path + "test.csv")


In [4]:
# Quick look at each dataset
print("Train shape:", train.shape)
print("Test shape:", test.shape)


train.head()

Train shape: (878049, 9)
Test shape: (884262, 7)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
# Function to display missing values summary for a dataframe
def missing_summary(df, name):
    print(f"\nMissing values in {name}:")
    missing = df.isna().sum()
    missing = missing[missing > 0]
    if len(missing) == 0:
        print("  No missing values.")
    else:
        print(missing)

# Check each dataset
missing_summary(train, "train")
missing_summary(test, "test")




Missing values in train:
  No missing values.

Missing values in test:
  No missing values.


In [6]:
# Target
y = train["Category"]

# Features (drop leakage columns)
X = train.drop(columns=["Category", "Descript", "Resolution"])

# Test features (keep Id for submission, drop from features)
X_test = test.drop(columns=["Id"])


In [7]:
# Convert Dates to datetime
X["Dates"] = pd.to_datetime(X["Dates"])
X_test["Dates"] = pd.to_datetime(X_test["Dates"])

# Extract useful time features
for df in [X, X_test]:
    df["Year"] = df["Dates"].dt.year
    df["Month"] = df["Dates"].dt.month
    df["Day"] = df["Dates"].dt.day
    df["Hour"] = df["Dates"].dt.hour

# Drop original Dates column
X = X.drop(columns=["Dates"])
X_test = X_test.drop(columns=["Dates"])


In [8]:
# Identify numeric and categorical columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Numeric preprocessing: fill missing values with median
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical preprocessing: fill missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)


In [9]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)


Training set shape: (702439, 9)
Validation set shape: (175610, 9)


In [10]:
# Decision Tree classification model
decision_tree_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=42))
])

# Fit the Decision Tree model
decision_tree_model.fit(X_train, y_train)

# Predict on validation set
dt_val_preds = decision_tree_model.predict(X_val)

# Compute validation accuracy
dt_acc = accuracy_score(y_val, dt_val_preds)
print("Decision Tree Validation Accuracy:", dt_acc)


Decision Tree Validation Accuracy: 0.2595865839075224


In [11]:
# Random Forest classification model
random_forest_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(
        n_estimators=100,      # number of trees
        random_state=42,
        n_jobs=-1              # use all CPU cores
    ))
])

# Fit the Random Forest model
random_forest_model.fit(X_train, y_train)

# Predict on validation set
rf_val_preds = random_forest_model.predict(X_val)

# Compute validation accuracy
rf_acc = accuracy_score(y_val, rf_val_preds)
print("Random Forest Validation Accuracy:", rf_acc)


Random Forest Validation Accuracy: 0.2617675531006207


In [12]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# SVM classification model (linear SVM)
svm_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LinearSVC())
])

# Fit the SVM model on the full training set
svm_model.fit(X_train, y_train)

# Predict on validation set
svm_val_preds = svm_model.predict(X_val)

# Compute validation accuracy
svm_acc = accuracy_score(y_val, svm_val_preds)
print("SVM Validation Accuracy:", svm_acc)




SVM Validation Accuracy: 0.2778372530038153


In [13]:
# Fit models on full training data
decision_tree_model.fit(X, y)
random_forest_model.fit(X, y)
svm_model.fit(X, y)




0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [14]:
# Predict with each model
dt_test_preds = decision_tree_model.predict(X_test)
rf_test_preds = random_forest_model.predict(X_test)
svm_test_preds = svm_model.predict(X_test)


In [15]:
# Decision Tree submission
submission_dt = pd.DataFrame({
    "Id": test["Id"],
    "Category": dt_test_preds
})
submission_dt.to_csv("submission_decision_tree.csv", index=False)
print("Saved: submission_decision_tree.csv")

# Random Forest submission
submission_rf = pd.DataFrame({
    "Id": test["Id"],
    "Category": rf_test_preds
})
submission_rf.to_csv("submission_random_forest.csv", index=False)
print("Saved: submission_random_forest.csv")

# SVM submission
submission_svm = pd.DataFrame({
    "Id": test["Id"],
    "Category": svm_test_preds
})
submission_svm.to_csv("submission_svm.csv", index=False)
print("Saved: submission_svm.csv")


Saved: submission_decision_tree.csv
Saved: submission_random_forest.csv
Saved: submission_svm.csv
