In [1]:
import pandas as pd

In [2]:
# Load the dataset and discard the leftover 'Unnamed: 0' column
# (it looks like a row index that was saved with the CSV).
df = pd.read_csv('dataframe.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,travelClass,bookingStatus,status1Day,status1Month,status1Week,status2Days,labels
0,3A,21,28,12,14,15,0.0
1,3A,14,63,-1,-1,-1,0.0
2,3A,39,-1,-1,-1,18,0.0
3,3A,11,46,8,-1,-1,0.0
4,3A,20,-1,-1,7,-1,0.0


In [4]:
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53381 entries, 0 to 53380
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   travelClass    53381 non-null  object 
 1   bookingStatus  53381 non-null  int64  
 2   status1Day     53381 non-null  int64  
 3   status1Month   53381 non-null  int64  
 4   status1Week    53381 non-null  int64  
 5   status2Days    53381 non-null  int64  
 6   labels         53381 non-null  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 2.9+ MB


In [6]:
# Share of each travel class, expressed as a percentage of all rows.
df["travelClass"].value_counts().mul(100).div(len(df))

travelClass
SL    47.426987
3A    36.546711
2A     9.838707
CC     3.962084
1A     1.212042
2S     1.013469
Name: count, dtype: float64

In [7]:
# Integer code assigned to each travel class label.
dt = {"SL": 0, "3A": 1, "2A": 2, "CC": 3, "1A": 4, "2S": 5}

# Encode only while the column still holds the raw labels. Re-running this
# cell on an already-encoded column would look the integer codes up in `dt`,
# find no match, and silently replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df["travelClass"]):
    df["travelClass"] = df["travelClass"].map(dt)

In [8]:
# Features: every column except the target. travelClass is left out as well,
# so the model is trained on the status columns only.
X = df.drop(columns=['labels', 'travelClass'])

# Target: the label column.
y = df['labels']

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)


In [10]:
# Optional (currently disabled): standardize the features with StandardScaler
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


In [11]:
# Imports for the classifiers and the evaluation metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [12]:
# Fit a 25-tree random forest with a fixed seed (out-of-bag scoring enabled).
model = RandomForestClassifier(
    n_estimators=25,
    oob_score=True,
    random_state=100,
).fit(X_train, y_train)

# Accuracy on the training rows -- data the model has already seen, so this
# is an optimistic figure; the held-out evaluation is the one to trust.
model.score(X_train, y_train)

0.9831397527163732

In [13]:
# Score the held-out rows.
predictions = model.predict(X_test)

# Report every metric as "<name>: <value>", one per line.
for metric_name, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    print(f'{metric_name}: {metric_fn(y_test, predictions)}')


Accuracy: 0.9571040554462864
Precision: 0.8796046720575023
Recall: 0.7513430544896393
F1 Score: 0.8104304635761589


In [23]:
def predict_new_data_point(model, new_data_point, dt):
    """Predict the label and the class probabilities for new observations.

    The caller's frame is never modified: the model inputs are taken from a
    new frame with ``travelClass`` removed, matching the feature set used for
    training (which excludes ``travelClass``).

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict`` and ``predict_proba``.
    new_data_point : pandas.DataFrame
        One row per observation. Must contain a ``travelClass`` column in
        addition to the feature columns the model was fitted on.
    dt : dict
        Travel-class label -> integer code mapping. Kept so existing callers
        keep working; it is not needed for the prediction because
        ``travelClass`` is dropped before the model is called.

    Returns
    -------
    tuple
        ``(predict, probabilities)`` -- the outputs of ``model.predict`` and
        ``model.predict_proba`` for the given rows.

    Raises
    ------
    KeyError
        If ``new_data_point`` has no ``travelClass`` column.
    """
    # drop() returns a new frame, so the caller's data is left as passed in.
    features = new_data_point.drop(columns=['travelClass'])

    # Per-class probabilities first, then the hard label.
    probabilities = model.predict_proba(features)
    predict = model.predict(features)

    return predict, probabilities

In [25]:
# A single observation to score, given column by column.
new_data_point = pd.DataFrame({
    'travelClass': ['SL'],
    'bookingStatus': [16],
    'status1Day': [-1],
    'status1Month': [-1],
    'status1Week': [-1],
    'status2Days': [-1],
})

# Returns the predicted label together with the class probabilities.
predict_new_data_point(model, new_data_point, dt)

(array([1.]), array([[0.02109629, 0.97890371]]))

In [26]:
import os
import pickle

# Directory under the current working directory that holds the saved artifacts.
# Passing the parts separately lets os.path.join pick the right separator.
model_path = os.path.join(os.getcwd(), "models")
os.makedirs(model_path, exist_ok=True)

# Store the classifier together with the travel-class encoding in one file.
model_dt = {"model": model, "dt": dt}

# The context manager closes the file even if pickling raises.
with open(os.path.join(model_path, "models.pkl"), 'wb') as model_file:
    pickle.dump(model_dt, model_file)