# Imports

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

from sklearn.model_selection import train_test_split
import xgboost as xgb

np.random.seed(1)

# Data Preprocessing

In [2]:

df=pd.read_csv('complaints_df.csv')
df['busTime'] = pd.to_datetime(df['busTime'])

# Initialize LabelEncoder
label_encoder = LabelEncoder()
# Delete rows where the specific column has the specific value
df = df[df['complainId'] != 'אחר']
# Reset the index of the dataframe
df = df.reset_index(drop=True)
# Encode the categorical column

df['isJewishHoliday'] = label_encoder.fit_transform(df['isJewishHoliday'])
df['operator'] = label_encoder.fit_transform(df['operator'])
df['accesible'] = label_encoder.fit_transform(df['accesible'])

In [3]:

# Convert 'busTime' column to datetime format
df['busTime'] = pd.to_datetime(df['busTime'], format='%d/%m/%Y %H:%M')

# Define the conditions for removal
condition1 = df['complainId'] == 'אוטובוס הגיע בזמן'
condition2 = (df['busTime'].dt.month >= 5) & (df['busTime'].dt.day >= 5)

# Combine both conditions using the '&' operator
combined_condition = condition1 & condition2

df=df.drop(['busTime'],axis=1)
imbalanced_df=df

# Drop rows that meet the combined condition
df = df[~combined_condition]
df['complainId'] = label_encoder.fit_transform(df['complainId'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['complainId'] = label_encoder.fit_transform(df['complainId'])


In [4]:
df

Unnamed: 0,busLine,complainId,stationId,isJewishHoliday,weekDay,relativeHumidity(%),temperature(Â°C),windSpeed(m/s),rainfall(mm),operator,routeLength,accesible,numberOfStations,weeklyDrives,numberOfLinesStoppingAtStation,dailyNumberOfPassengersAtStation,hour,passengersNumberSum,dailyNumberOfStopsAtStation
0,322,0,13983,0,3,53,17.0,2.7,0.0,4,41.2,1,45,25,39,274.541667,11,6,771
1,16,1,13983,0,3,53,17.0,2.7,0.0,2,41.2,1,45,25,39,151.291667,11,6,771
2,26,2,13992,0,3,48,18.9,3.6,0.0,4,47.8,1,35,33,34,151.291667,13,1,725
3,11,3,11292,0,4,93,8.4,1.4,0.0,2,19.7,0,47,243,39,293.979167,6,1,294
4,11,0,11292,0,4,93,8.4,1.4,0.0,2,19.7,0,47,243,39,293.979167,6,1,294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,10,0,13992,0,4,50,25.3,3.6,0.0,2,11.9,1,19,168,34,151.291667,13,2,725
174,12,0,11342,0,5,50,25.3,3.6,0.0,2,8.8,0,26,491,8,110.541667,13,1,765
178,25,0,13992,0,4,50,25.3,3.6,0.0,2,16.9,0,44,434,34,151.291667,13,3,725
180,12,1,11132,0,5,39,15.2,2.0,0.0,2,19.7,0,47,243,8,80.958333,7,1,294


# Train Test Split

In [5]:

X = df.drop(['complainId'], axis=1)
y = df['complainId']

# Split the data into train and test sets, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.201, stratify=y, random_state=1)

# Modeling

## Decision Tree

In [6]:
# Create a Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(max_depth=3)

# Fit the classifier to the training data
dt_classifier.fit(X_train, y_train)

# Predict using the trained classifier
dt_predictions = dt_classifier.predict(X_test)

dt_accuracy = accuracy_score(y_test, dt_predictions)

## Random Forest

In [7]:
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Predict using the trained classifier
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

## XGBoost

In [8]:
# Create an XGBoost classifier
xgb_classifier = xgb.XGBClassifier(n_estimators=30,max_depth=7,learning_rate=0.1)

# Fit the classifier to the training data
xgb_classifier.fit(X_train, y_train)

# Predict using the trained classifier
xgb_predictions = xgb_classifier.predict(X_test)


xgb_accuracy = accuracy_score(y_test, xgb_predictions)

# Accuracy

In [9]:
print('Random Forest Accuracy: '+str(round(rf_accuracy,5)))

print('XGBoost Accuracy: '+str(round(xgb_accuracy,5)))

print('Decision Tree Accuracy: ' + str(round(dt_accuracy,5)))

Random Forest Accuracy: 0.8
XGBoost Accuracy: 0.84
Decision Tree Accuracy: 0.76


# Precision, Recall, F1-Score

In [10]:

imbalanced_df['complainId'] = imbalanced_df['complainId'].replace('אוטובוס הגיע בזמן', 1).replace(['.*'], 0, regex=True).astype(int)


X = imbalanced_df.drop(['complainId'], axis=1)
y = imbalanced_df['complainId']

# Split the data into train and test sets, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=1)

## Decision Tree

In [11]:
# Create a Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(max_depth=4)

# Fit the classifier to the training data
dt_classifier.fit(X_train, y_train)

# Predict using the trained classifier
dt_predictions = dt_classifier.predict(X_test)

# Predict using the trained classifier
dt_precision = precision_score(y_test, dt_predictions)
dt_recall = recall_score(y_test, dt_predictions)
dt_f1 = f1_score(y_test, dt_predictions)

## Random Forest

In [12]:
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=20)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Predict using the trained classifier
rf_predictions = rf_classifier.predict(X_test)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)

## XGBoost

In [13]:
# Create an XGBoost classifier
xgb_classifier = xgb.XGBClassifier(n_estimators=10,max_depth=4,learning_rate=0.01)

# Fit the classifier to the training data
xgb_classifier.fit(X_train, y_train)

# Predict using the trained classifier
xgb_predictions = xgb_classifier.predict(X_test)


xgb_precision = precision_score(y_test, xgb_predictions)
xgb_recall = recall_score(y_test, xgb_predictions)
xgb_f1 = f1_score(y_test, xgb_predictions)

# Results

In [14]:
print('Random Forest Precision: '+str(round(rf_precision,5)))

print('XGBoost Precision: '+str(round(xgb_precision,5)))

print('Decision Tree Precision: ' + str(round(dt_precision,5)))

Random Forest Precision: 0.69231
XGBoost Precision: 0.72131
Decision Tree Precision: 0.72308


In [15]:
print('Random Forest Recall: '+str(round(rf_recall,5)))

print('XGBoost Recall: '+str(round(xgb_recall,5)))

print('Decision Tree Recall: ' + str(round(dt_recall,5)))

Random Forest Recall: 0.88235
XGBoost Recall: 0.86275
Decision Tree Recall: 0.92157


In [16]:
print('Random Forest F1-Score: '+str(round(rf_f1,5)))

print('XGBoost F1-Score: '+str(round(xgb_f1,5)))

print('Decision Tree F1-Score: ' + str(round(dt_f1,5)))

Random Forest F1-Score: 0.77586
XGBoost F1-Score: 0.78571
Decision Tree F1-Score: 0.81034
