In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Define groups and their corresponding dummy numbers
frequent_groups = {
    'Family': 563219,
    'Friends': 986547,
    'Colleagues': 471825,
    'Clients': 837462,
    'Neighbors': 295184,
    'GymBuddies': 628731,
    'BookClub': 159246,
    'Cook': 485932,
    'Maid': 761934,
    'College1': 372185,
    'College2': 891354,
    'College3': 641782,
}

occasional_groups = {
    'Doctor': 279615,
    'Dentist': 465823,
    'Lawyer': 823179,
    'HairStylist': 681593,
    'Plumber': 591237,
    'Electrician': 358216,
    'Mechanic': 724815,
    'Teacher': 918234,
    'Tutor': 674189,
    'Landlord': 531962,
    'GroceryStore': 847263,
    'Bank': 219834,
    'PostOffice': 536194,
    'DeliveryService': 861294,
    'SecurityGuard': 497231,
    'Babysitter': 725163,
}

# Define caller locations and their associated groups
caller_locations = {
    'Home': ['Family', 'Friends', 'Cook', 'Maid'],
    'Office': ['Colleagues', 'Clients', 'Neighbors'],
    'Restaurant': ['Family', 'Friends', 'Cook', 'Clients'],
    'Gym': ['GymBuddies'],
    'Park': ['Family', 'Friends', 'Neighbors'],
    'Shopping Mall': ['Friends', 'Neighbors'],
    'School': ['College1', 'College2', 'College3'],
    'Airport': ['Clients'],
    'Library': ['College1', 'College2', 'College3'],
    'Hospital': ['Doctor', 'Dentist'],
}

# Define time of day categories
time_of_day_categories = {
    'Morning': ['Home', 'Office', 'Gym', 'Park', 'School'],
    'Early Morning': ['Home', 'Park'],
    'Afternoon': ['Office', 'Restaurant', 'Shopping Mall', 'Library'],
    'Mid Afternoon': ['Office', 'Shopping Mall', 'Library'],
    'Night': ['Home', 'Restaurant'],
}

# Define other parameters
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)
caller_id = 'John Doe'
caller_number = '+919876543210'  # Assuming a fictional phone number
num_rows = 1000

# Generate call data
call_data = {
    'CallerID': [],
    'CallerNumber': [],
    'ReceiverID': [],
    'ReceiverNumber': [],
    'CallDate': [],
    'TimeOfDay': [],
    'CallDuration': [],
    'GroupID': [],
    'CallerLocation': [],
    'CallPurpose': [],
    'CallOutcome': [],
    'DayOfWeek': [],
}

# Define the frequency of frequent groups
frequent_groups_frequency = 12
occasional_groups_frequency = num_rows // len(frequent_groups)

for _ in range(num_rows):
    # Determine the caller's location and time of day
    caller_location = random.choice(list(caller_locations.keys()))
    time_of_day = random.choice(list(time_of_day_categories.keys()))
    
    # Choose the receiver's group based on the caller's location
    receiver_group = random.choice(caller_locations[caller_location])
    
    # Determine the receiver's number based on the selected group
    if receiver_group in frequent_groups:
        receiver_number = frequent_groups[receiver_group]
    else:
        receiver_number = occasional_groups[receiver_group]
    
    # Choose the receiver's ID randomly within the group
    receiver_id = random.choice([receiver_group + str(i) for i in range(1, 11)])

    call_date = start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
    call_duration = random.randint(60, 600)  # Random duration between 1 minute and 10 minutes
    call_purpose = random.choice(['Meeting', 'Social', 'Business'])
    call_outcome = random.choice(['Connected', 'Missed', 'Voicemail'])
    day_of_week = call_date.strftime("%A")  # Get the day of the week

    call_data['CallerID'].append(caller_id)
    call_data['CallerNumber'].append(caller_number)
    call_data['ReceiverID'].append(receiver_id)
    call_data['ReceiverNumber'].append(receiver_number)
    call_data['CallDate'].append(call_date)
    call_data['TimeOfDay'].append(time_of_day)  # Add the TimeOfDay column
    call_data['CallDuration'].append(call_duration)
    call_data['GroupID'].append(receiver_number)
    call_data['CallerLocation'].append(caller_location)
    call_data['CallPurpose'].append(call_purpose)
    call_data['CallOutcome'].append(call_outcome)
    call_data['DayOfWeek'].append(day_of_week)

# Create DataFrame
df = pd.DataFrame(call_data)

# Display the DataFrame
print(df.head())


   CallerID   CallerNumber  ReceiverID  ReceiverNumber   CallDate  \
0  John Doe  +919876543210   College21          891354 2023-04-30   
1  John Doe  +919876543210    Clients1          837462 2023-05-12   
2  John Doe  +919876543210  Neighbors6          295184 2023-09-27   
3  John Doe  +919876543210     Family1          563219 2023-05-26   
4  John Doe  +919876543210  Neighbors3          295184 2023-09-29   

       TimeOfDay  CallDuration  GroupID CallerLocation CallPurpose  \
0  Early Morning           277   891354        Library      Social   
1          Night           331   837462        Airport      Social   
2          Night            64   295184  Shopping Mall      Social   
3  Mid Afternoon           138   563219           Home      Social   
4          Night           428   295184  Shopping Mall      Social   

  CallOutcome  DayOfWeek  
0      Missed     Sunday  
1      Missed     Friday  
2   Connected  Wednesday  
3      Missed     Friday  
4   Connected     Friday  


In [2]:
df.head()

Unnamed: 0,CallerID,CallerNumber,ReceiverID,ReceiverNumber,CallDate,TimeOfDay,CallDuration,GroupID,CallerLocation,CallPurpose,CallOutcome,DayOfWeek
0,John Doe,919876543210,College21,891354,2023-04-30,Early Morning,277,891354,Library,Social,Missed,Sunday
1,John Doe,919876543210,Clients1,837462,2023-05-12,Night,331,837462,Airport,Social,Missed,Friday
2,John Doe,919876543210,Neighbors6,295184,2023-09-27,Night,64,295184,Shopping Mall,Social,Connected,Wednesday
3,John Doe,919876543210,Family1,563219,2023-05-26,Mid Afternoon,138,563219,Home,Social,Missed,Friday
4,John Doe,919876543210,Neighbors3,295184,2023-09-29,Night,428,295184,Shopping Mall,Social,Connected,Friday


In [3]:
# Remove unnecessary columns for modeling
df_preprocessed = df.drop(['CallerID', 'CallerNumber', 'GroupID','CallOutcome'], axis=1)

# Display the preprocessed DataFrame
df_preprocessed.head()


Unnamed: 0,ReceiverID,ReceiverNumber,CallDate,TimeOfDay,CallDuration,CallerLocation,CallPurpose,DayOfWeek
0,College21,891354,2023-04-30,Early Morning,277,Library,Social,Sunday
1,Clients1,837462,2023-05-12,Night,331,Airport,Social,Friday
2,Neighbors6,295184,2023-09-27,Night,64,Shopping Mall,Social,Wednesday
3,Family1,563219,2023-05-26,Mid Afternoon,138,Home,Social,Friday
4,Neighbors3,295184,2023-09-29,Night,428,Shopping Mall,Social,Friday


In [4]:
df_preprocessed['Month'] = df_preprocessed['CallDate'].dt.month_name()


In [5]:
df_preprocessed.head()

Unnamed: 0,ReceiverID,ReceiverNumber,CallDate,TimeOfDay,CallDuration,CallerLocation,CallPurpose,DayOfWeek,Month
0,College21,891354,2023-04-30,Early Morning,277,Library,Social,Sunday,April
1,Clients1,837462,2023-05-12,Night,331,Airport,Social,Friday,May
2,Neighbors6,295184,2023-09-27,Night,64,Shopping Mall,Social,Wednesday,September
3,Family1,563219,2023-05-26,Mid Afternoon,138,Home,Social,Friday,May
4,Neighbors3,295184,2023-09-29,Night,428,Shopping Mall,Social,Friday,September


In [6]:
df_preprocessed = pd.get_dummies(df_preprocessed, columns=['TimeOfDay', 'DayOfWeek', 'Month'])


In [7]:
df_preprocessed

Unnamed: 0,ReceiverID,ReceiverNumber,CallDate,CallDuration,CallerLocation,CallPurpose,TimeOfDay_Afternoon,TimeOfDay_Early Morning,TimeOfDay_Mid Afternoon,TimeOfDay_Morning,...,Month_December,Month_February,Month_January,Month_July,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September
0,College21,891354,2023-04-30,277,Library,Social,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Clients1,837462,2023-05-12,331,Airport,Social,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,Neighbors6,295184,2023-09-27,64,Shopping Mall,Social,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Family1,563219,2023-05-26,138,Home,Social,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,Neighbors3,295184,2023-09-29,428,Shopping Mall,Social,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Doctor3,279615,2023-08-24,505,Hospital,Meeting,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,GymBuddies8,628731,2023-09-16,76,Gym,Business,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
997,Cook4,485932,2023-05-29,67,Home,Business,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
998,GymBuddies6,628731,2023-07-10,332,Gym,Business,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [8]:
df_preprocessed.shape

(1000, 30)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'ReceiverID' column
df_preprocessed['ReceiverID'] = label_encoder.fit_transform(df['ReceiverID'])


In [10]:
df_preprocessed.head()

Unnamed: 0,ReceiverID,ReceiverNumber,CallDate,CallDuration,CallerLocation,CallPurpose,TimeOfDay_Afternoon,TimeOfDay_Early Morning,TimeOfDay_Mid Afternoon,TimeOfDay_Morning,...,Month_December,Month_February,Month_January,Month_July,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September
0,30,891354,2023-04-30,277,Library,Social,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,837462,2023-05-12,331,Airport,Social,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,125,295184,2023-09-27,64,Shopping Mall,Social,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,80,563219,2023-05-26,138,Home,Social,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,122,295184,2023-09-29,428,Shopping Mall,Social,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Convert 'ReceiverID' and 'ReceiverNumber' columns to strings and then concatenate
df_preprocessed['EncodedReceiverID'] = df_preprocessed['ReceiverID'].astype(str) + '-' + df_preprocessed['ReceiverNumber'].astype(str)

# Display the DataFrame to verify
print(df_preprocessed.head())


   ReceiverID  ReceiverNumber   CallDate  CallDuration CallerLocation  \
0          30          891354 2023-04-30           277        Library   
1           0          837462 2023-05-12           331        Airport   
2         125          295184 2023-09-27            64  Shopping Mall   
3          80          563219 2023-05-26           138           Home   
4         122          295184 2023-09-29           428  Shopping Mall   

  CallPurpose  TimeOfDay_Afternoon  TimeOfDay_Early Morning  \
0      Social                    0                        1   
1      Social                    0                        0   
2      Social                    0                        0   
3      Social                    0                        0   
4      Social                    0                        0   

   TimeOfDay_Mid Afternoon  TimeOfDay_Morning  ...  Month_February  \
0                        0                  0  ...               0   
1                        0                

In [13]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the concatenated column
df_preprocessed['EncodedReceiverID'] = label_encoder.fit_transform(df_preprocessed['EncodedReceiverID'])

# Display the DataFrame to verify
print(df_preprocessed.head())


   ReceiverID  ReceiverNumber   CallDate  CallDuration CallerLocation  \
0          30          891354 2023-04-30           277        Library   
1           0          837462 2023-05-12           331        Airport   
2         125          295184 2023-09-27            64  Shopping Mall   
3          80          563219 2023-05-26           138           Home   
4         122          295184 2023-09-29           428  Shopping Mall   

  CallPurpose  TimeOfDay_Afternoon  TimeOfDay_Early Morning  \
0      Social                    0                        1   
1      Social                    0                        0   
2      Social                    0                        0   
3      Social                    0                        0   
4      Social                    0                        0   

   TimeOfDay_Mid Afternoon  TimeOfDay_Morning  ...  Month_February  \
0                        0                  0  ...               0   
1                        0                

In [14]:
df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   ReceiverID               1000 non-null   int64         
 1   ReceiverNumber           1000 non-null   int64         
 2   CallDate                 1000 non-null   datetime64[ns]
 3   CallDuration             1000 non-null   int64         
 4   CallerLocation           1000 non-null   object        
 5   CallPurpose              1000 non-null   object        
 6   TimeOfDay_Afternoon      1000 non-null   uint8         
 7   TimeOfDay_Early Morning  1000 non-null   uint8         
 8   TimeOfDay_Mid Afternoon  1000 non-null   uint8         
 9   TimeOfDay_Morning        1000 non-null   uint8         
 10  TimeOfDay_Night          1000 non-null   uint8         
 11  DayOfWeek_Friday         1000 non-null   uint8         
 12  DayOfWeek_Monday         1000 non-n

In [15]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to 'CallerLocation' and 'CallPurpose' columns
df_preprocessed['EncodedCallerLocation'] = label_encoder.fit_transform(df_preprocessed['CallerLocation'])
df_preprocessed['EncodedCallPurpose'] = label_encoder.fit_transform(df_preprocessed['CallPurpose'])

# Display the DataFrame to verify
df_preprocessed.head()


Unnamed: 0,ReceiverID,ReceiverNumber,CallDate,CallDuration,CallerLocation,CallPurpose,TimeOfDay_Afternoon,TimeOfDay_Early Morning,TimeOfDay_Mid Afternoon,TimeOfDay_Morning,...,Month_July,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September,EncodedReceiverID,EncodedCallerLocation,EncodedCallPurpose
0,30,891354,2023-04-30,277,Library,Social,0,1,0,0,...,0,0,0,0,0,0,0,53,4,2
1,0,837462,2023-05-12,331,Airport,Social,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2
2,125,295184,2023-09-27,64,Shopping Mall,Social,0,0,0,0,...,0,0,0,0,0,0,1,30,9,2
3,80,563219,2023-05-26,138,Home,Social,0,0,1,0,...,0,0,0,1,0,0,0,108,2,2
4,122,295184,2023-09-29,428,Shopping Mall,Social,0,0,0,0,...,0,0,0,0,0,0,1,27,9,2


In [16]:
# Drop the unnecessary columns after label encoding
df_preprocessed.drop(['ReceiverID', 'ReceiverNumber', 'CallDate','CallDuration','CallerLocation','CallPurpose' ], axis=1, inplace=True)




In [17]:
df_preprocessed.head()

Unnamed: 0,TimeOfDay_Afternoon,TimeOfDay_Early Morning,TimeOfDay_Mid Afternoon,TimeOfDay_Morning,TimeOfDay_Night,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,...,Month_July,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September,EncodedReceiverID,EncodedCallerLocation,EncodedCallPurpose
0,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,53,4,2
1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,30,9,2
3,0,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,108,2,2
4,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,27,9,2


In [18]:
from sklearn.model_selection import train_test_split

# Split the preprocessed data into features (X) and target variable (y)
X = df_preprocessed.drop('EncodedReceiverID', axis=1)
y = df_preprocessed['EncodedReceiverID']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten

# Define the CNN model
cnn_model = Sequential()
cnn_model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(100, activation='relu'))
cnn_model.add(Dense(1))

# Compile the model
cnn_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Fit the model
cnn_model.fit(X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1)), y_train, epochs=10, batch_size=32, verbose=1)

# Evaluate the model
cnn_loss, cnn_mae = cnn_model.evaluate(X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1)), y_test)
print("CNN Mean Absolute Error:", cnn_mae)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CNN Mean Absolute Error: 36.436763763427734


In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict using the model
rf_predictions = rf_model.predict(X_test)

# Calculate Mean Absolute Error
rf_mae = mean_absolute_error(y_test, rf_predictions)
print("Random Forest Mean Absolute Error:", rf_mae)


Random Forest Mean Absolute Error: 24.005738333333333


In [23]:
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# Create a CatBoostRegressor instance
catboost_regressor = CatBoostRegressor(iterations=1000,  # Number of trees
                                       learning_rate=0.1,  # Learning rate
                                       depth=6)  # Depth of trees

# Define the number of folds for cross-validation
num_folds = 5

# Define a scorer for the cross-validation
scorer = make_scorer(mean_absolute_error)

# Perform k-fold cross-validation for the CatBoost model
catboost_scores = cross_val_score(catboost_regressor, X_train, y_train, cv=num_folds, scoring=scorer)

# Calculate the mean MAE for the CatBoost model
mean_catboost_mae = catboost_scores.mean()

print("Mean MAE for CatBoost model:", mean_catboost_mae)


0:	learn: 39.3263837	total: 57.7ms	remaining: 57.6s
1:	learn: 38.3035082	total: 58.5ms	remaining: 29.2s
2:	learn: 37.3821126	total: 58.9ms	remaining: 19.6s
3:	learn: 36.6581174	total: 59.2ms	remaining: 14.7s
4:	learn: 35.8499709	total: 59.5ms	remaining: 11.8s
5:	learn: 35.4065304	total: 59.9ms	remaining: 9.92s
6:	learn: 34.5943331	total: 60.3ms	remaining: 8.55s
7:	learn: 34.0677230	total: 60.6ms	remaining: 7.51s
8:	learn: 33.7735574	total: 60.9ms	remaining: 6.71s
9:	learn: 33.5738014	total: 61.1ms	remaining: 6.05s
10:	learn: 33.1975530	total: 61.4ms	remaining: 5.52s
11:	learn: 32.8932217	total: 61.7ms	remaining: 5.08s
12:	learn: 32.5643968	total: 62ms	remaining: 4.71s
13:	learn: 32.4030306	total: 62.4ms	remaining: 4.39s
14:	learn: 32.1462903	total: 62.7ms	remaining: 4.12s
15:	learn: 31.8898841	total: 63ms	remaining: 3.88s
16:	learn: 31.7251494	total: 63.4ms	remaining: 3.66s
17:	learn: 31.5527250	total: 63.8ms	remaining: 3.48s
18:	learn: 31.3411225	total: 64.1ms	remaining: 3.31s
19:	lea

728:	learn: 8.3937077	total: 239ms	remaining: 88.7ms
729:	learn: 8.3883267	total: 239ms	remaining: 88.4ms
730:	learn: 8.3808505	total: 239ms	remaining: 88.1ms
731:	learn: 8.3749680	total: 240ms	remaining: 87.7ms
732:	learn: 8.3694225	total: 240ms	remaining: 87.4ms
733:	learn: 8.3608025	total: 240ms	remaining: 87.1ms
734:	learn: 8.3530764	total: 241ms	remaining: 86.7ms
735:	learn: 8.3454943	total: 241ms	remaining: 86.4ms
736:	learn: 8.3382756	total: 241ms	remaining: 86ms
737:	learn: 8.3309490	total: 241ms	remaining: 85.6ms
738:	learn: 8.3211748	total: 241ms	remaining: 85.3ms
739:	learn: 8.3151986	total: 242ms	remaining: 85.2ms
740:	learn: 8.3068025	total: 243ms	remaining: 84.8ms
741:	learn: 8.2947778	total: 243ms	remaining: 84.4ms
742:	learn: 8.2862782	total: 243ms	remaining: 84.1ms
743:	learn: 8.2813831	total: 243ms	remaining: 83.7ms
744:	learn: 8.2691316	total: 244ms	remaining: 83.4ms
745:	learn: 8.2607658	total: 244ms	remaining: 83ms
746:	learn: 8.2559038	total: 244ms	remaining: 82.6

360:	learn: 13.4799668	total: 96.8ms	remaining: 171ms
361:	learn: 13.4619238	total: 97.1ms	remaining: 171ms
362:	learn: 13.4423760	total: 97.4ms	remaining: 171ms
363:	learn: 13.4178095	total: 97.7ms	remaining: 171ms
364:	learn: 13.3874989	total: 97.9ms	remaining: 170ms
365:	learn: 13.3598510	total: 98.2ms	remaining: 170ms
366:	learn: 13.3278190	total: 98.5ms	remaining: 170ms
367:	learn: 13.3112929	total: 98.8ms	remaining: 170ms
368:	learn: 13.2942116	total: 99.1ms	remaining: 169ms
369:	learn: 13.2783879	total: 99.3ms	remaining: 169ms
370:	learn: 13.2386952	total: 99.6ms	remaining: 169ms
371:	learn: 13.2142235	total: 99.8ms	remaining: 168ms
372:	learn: 13.1917281	total: 100ms	remaining: 168ms
373:	learn: 13.1681225	total: 100ms	remaining: 168ms
374:	learn: 13.1483675	total: 101ms	remaining: 168ms
375:	learn: 13.1208983	total: 101ms	remaining: 167ms
376:	learn: 13.0980431	total: 101ms	remaining: 167ms
377:	learn: 13.0732345	total: 101ms	remaining: 167ms
378:	learn: 13.0489073	total: 102m

16:	learn: 32.7871028	total: 4.76ms	remaining: 276ms
17:	learn: 32.6239796	total: 5.07ms	remaining: 276ms
18:	learn: 32.4847385	total: 5.33ms	remaining: 275ms
19:	learn: 32.3812871	total: 5.58ms	remaining: 273ms
20:	learn: 32.1440180	total: 5.97ms	remaining: 278ms
21:	learn: 31.9831250	total: 6.25ms	remaining: 278ms
22:	learn: 31.8219286	total: 6.46ms	remaining: 274ms
23:	learn: 31.7223194	total: 6.71ms	remaining: 273ms
24:	learn: 31.4694170	total: 7.06ms	remaining: 275ms
25:	learn: 31.4033209	total: 7.39ms	remaining: 277ms
26:	learn: 31.1991964	total: 7.61ms	remaining: 274ms
27:	learn: 31.1159532	total: 7.85ms	remaining: 273ms
28:	learn: 30.9925240	total: 8.19ms	remaining: 274ms
29:	learn: 30.8099815	total: 8.43ms	remaining: 273ms
30:	learn: 30.6707906	total: 8.71ms	remaining: 272ms
31:	learn: 30.5408552	total: 8.94ms	remaining: 270ms
32:	learn: 30.3860580	total: 9.22ms	remaining: 270ms
33:	learn: 30.2702303	total: 9.51ms	remaining: 270ms
34:	learn: 30.1596687	total: 9.73ms	remaining:

674:	learn: 8.2830240	total: 188ms	remaining: 90.4ms
675:	learn: 8.2766732	total: 188ms	remaining: 90.3ms
676:	learn: 8.2691911	total: 189ms	remaining: 90ms
677:	learn: 8.2531236	total: 189ms	remaining: 89.8ms
678:	learn: 8.2432922	total: 189ms	remaining: 89.6ms
679:	learn: 8.2275137	total: 190ms	remaining: 89.3ms
680:	learn: 8.2172622	total: 190ms	remaining: 89ms
681:	learn: 8.2051580	total: 191ms	remaining: 88.8ms
682:	learn: 8.1981834	total: 191ms	remaining: 88.6ms
683:	learn: 8.1885365	total: 191ms	remaining: 88.3ms
684:	learn: 8.1784724	total: 192ms	remaining: 88.1ms
685:	learn: 8.1633263	total: 192ms	remaining: 87.8ms
686:	learn: 8.1532951	total: 192ms	remaining: 87.5ms
687:	learn: 8.1434063	total: 192ms	remaining: 87.3ms
688:	learn: 8.1327384	total: 193ms	remaining: 87ms
689:	learn: 8.1258734	total: 193ms	remaining: 86.8ms
690:	learn: 8.1145070	total: 193ms	remaining: 86.5ms
691:	learn: 8.1027428	total: 194ms	remaining: 86.2ms
692:	learn: 8.0870749	total: 194ms	remaining: 86ms
6

242:	learn: 16.7730030	total: 67.4ms	remaining: 210ms
243:	learn: 16.7360714	total: 67.9ms	remaining: 210ms
244:	learn: 16.6937604	total: 68.2ms	remaining: 210ms
245:	learn: 16.6636252	total: 68.4ms	remaining: 210ms
246:	learn: 16.6369575	total: 68.7ms	remaining: 209ms
247:	learn: 16.5974156	total: 68.9ms	remaining: 209ms
248:	learn: 16.5636314	total: 69.3ms	remaining: 209ms
249:	learn: 16.5376426	total: 69.6ms	remaining: 209ms
250:	learn: 16.5070109	total: 69.9ms	remaining: 209ms
251:	learn: 16.4725261	total: 70.2ms	remaining: 208ms
252:	learn: 16.4511363	total: 70.5ms	remaining: 208ms
253:	learn: 16.4070271	total: 70.7ms	remaining: 208ms
254:	learn: 16.3811120	total: 71ms	remaining: 207ms
255:	learn: 16.3544952	total: 71.3ms	remaining: 207ms
256:	learn: 16.3252257	total: 71.6ms	remaining: 207ms
257:	learn: 16.3011190	total: 71.9ms	remaining: 207ms
258:	learn: 16.2583480	total: 72.2ms	remaining: 206ms
259:	learn: 16.2096431	total: 72.5ms	remaining: 206ms
260:	learn: 16.1765762	total: 

865:	learn: 7.5823090	total: 251ms	remaining: 38.9ms
866:	learn: 7.5766842	total: 252ms	remaining: 38.6ms
867:	learn: 7.5729228	total: 252ms	remaining: 38.3ms
868:	learn: 7.5685600	total: 252ms	remaining: 38ms
869:	learn: 7.5629273	total: 252ms	remaining: 37.7ms
870:	learn: 7.5582172	total: 253ms	remaining: 37.4ms
871:	learn: 7.5522809	total: 253ms	remaining: 37.1ms
872:	learn: 7.5442788	total: 253ms	remaining: 36.8ms
873:	learn: 7.5399152	total: 253ms	remaining: 36.5ms
874:	learn: 7.5336751	total: 254ms	remaining: 36.2ms
875:	learn: 7.5229675	total: 254ms	remaining: 35.9ms
876:	learn: 7.5172889	total: 254ms	remaining: 35.7ms
877:	learn: 7.5123881	total: 255ms	remaining: 35.4ms
878:	learn: 7.5078420	total: 255ms	remaining: 35.1ms
879:	learn: 7.5019142	total: 255ms	remaining: 34.8ms
880:	learn: 7.4975781	total: 255ms	remaining: 34.5ms
881:	learn: 7.4926710	total: 256ms	remaining: 34.2ms
882:	learn: 7.4851571	total: 256ms	remaining: 33.9ms
883:	learn: 7.4813782	total: 256ms	remaining: 33

481:	learn: 11.0887486	total: 133ms	remaining: 143ms
482:	learn: 11.0730786	total: 134ms	remaining: 143ms
483:	learn: 11.0613445	total: 134ms	remaining: 143ms
484:	learn: 11.0512785	total: 134ms	remaining: 142ms
485:	learn: 11.0308934	total: 134ms	remaining: 142ms
486:	learn: 11.0090125	total: 135ms	remaining: 142ms
487:	learn: 10.9884935	total: 135ms	remaining: 142ms
488:	learn: 10.9641562	total: 135ms	remaining: 141ms
489:	learn: 10.9439393	total: 135ms	remaining: 141ms
490:	learn: 10.9341809	total: 136ms	remaining: 141ms
491:	learn: 10.9209505	total: 136ms	remaining: 140ms
492:	learn: 10.9035852	total: 136ms	remaining: 140ms
493:	learn: 10.8880756	total: 137ms	remaining: 140ms
494:	learn: 10.8696661	total: 137ms	remaining: 140ms
495:	learn: 10.8530225	total: 137ms	remaining: 139ms
496:	learn: 10.8409521	total: 137ms	remaining: 139ms
497:	learn: 10.8187385	total: 138ms	remaining: 139ms
498:	learn: 10.7999541	total: 138ms	remaining: 138ms
499:	learn: 10.7861262	total: 138ms	remaining:

In [24]:
print("Mean MAE for CatBoost model:", mean_catboost_mae)


Mean MAE for CatBoost model: 27.255679946660102


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = mean_squared_error(y_test, rf_predictions, squared=False)
rf_r2 = r2_score(y_test, rf_predictions)

# Print the evaluation metrics
print("Random Forest Mean Absolute Error (MAE):", rf_mae)
print("Random Forest Mean Squared Error (MSE):", rf_mse)
print("Random Forest Root Mean Squared Error (RMSE):", rf_rmse)
print("Random Forest R-squared (R2) Score:", rf_r2)


Random Forest Mean Absolute Error (MAE): 24.005738333333333
Random Forest Mean Squared Error (MSE): 1231.0504121983333
Random Forest Root Mean Squared Error (RMSE): 35.086327995365
Random Forest R-squared (R2) Score: 0.28403225119520814
