# **Creating the per-cluster models**

In [1]:
import pandas as pd
from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import itertools

# Load the training set, parse the timestamp column (given in seconds) into
# datetimes, and derive the two calendar features used by the rest of the
# notebook.  Later keyword arguments of assign() see the columns produced by
# earlier ones, so 'hour' and 'day_of_week' read the already-parsed timestamp.
train_df = pd.read_csv('train_data.csv').assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'),
    hour=lambda frame: frame['timestamp'].dt.hour,
    day_of_week=lambda frame: frame['timestamp'].dt.dayofweek,
)

# Find unique IDs (not needed by the loop below any more; kept defined in case
# another cell refers to it)
unique_ids = train_df['id'].unique()

# Features summarised per user: location plus the two time-of-week features
features = ['latitude', 'longitude', 'hour', 'day_of_week']

# One record per user.  The records are collected in a plain list and turned
# into a DataFrame once at the end; growing a DataFrame with pd.concat inside
# the loop re-copies every previously stored row on each iteration.
centroid_records = []

# groupby(sort=False) yields the users in order of first appearance (the same
# order as train_df['id'].unique()) and keeps each user's rows in their
# original order, without rescanning train_df once per user.
for user_id, user_data in train_df.groupby('id', sort=False):
    X = user_data[features]

    # Perform clustering (1 cluster); its single centre is the user's centroid
    kmeans = KMeans(n_clusters=1, random_state=42, n_init=1)
    kmeans.fit(X)

    # Centre coordinates follow the order of `features`
    centroid = kmeans.cluster_centers_[0]
    centroid_hour = int(round(centroid[2]))

    centroid_records.append({
        'id': user_id,
        'centroid_latitude': centroid[0],
        'centroid_longitude': centroid[1],
        'centroid_hour': centroid_hour,
        'centroid_day_of_week': int(round(centroid[3])),
    })

# Centroid locations for each user, including the rounded hour and day_of_week.
# Passing `columns` keeps the expected schema even when there are no users.
centroid_data = pd.DataFrame(
    centroid_records,
    columns=['id', 'centroid_latitude', 'centroid_longitude', 'centroid_hour', 'centroid_day_of_week'],
)

# Display the new DataFrame with centroid locations for each user including time and day_of_week
#print(centroid_data)

In [2]:
# Cluster the users on their centroid features only.  The columns are selected
# explicitly instead of dropping 'id': once this cell has run, centroid_data
# also holds 'cluster_label', and a drop-based selection would feed the
# previous labels back into the clustering when the cell is executed again.
centroid_features = ['centroid_latitude', 'centroid_longitude', 'centroid_hour', 'centroid_day_of_week']
X = centroid_data[centroid_features]

# Perform k-means clustering (5 clusters)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=1)
kmeans.fit(X)

# Add cluster labels to the centroid_data DataFrame (one label per user row)
centroid_data['cluster_label'] = kmeans.labels_

# Display the updated DataFrame with cluster labels
#print(centroid_data)

In [3]:
#import matplotlib.pyplot as plt

# Plotting the user ID clusters
#plt.figure(figsize=(8, 6))

#for cluster_label in centroid_data['cluster_label'].unique():
#    cluster = centroid_data[centroid_data['cluster_label'] == cluster_label]
#    plt.scatter(cluster['centroid_longitude'], cluster['centroid_latitude'], label=f'Cluster {cluster_label}')

#plt.xlabel('Longitude')
#plt.ylabel('Latitude')
#plt.title('Clustering of User IDs')
#plt.legend()
#plt.show()

In [4]:
# Mean pairwise Euclidean distance between the user centroids of each cluster.
# NOTE: the average runs over the full square matrix returned by
# pairwise_distances, so every point's zero distance to itself is part of the
# mean (a cluster with a single user therefore scores 0).
distance_columns = ['centroid_latitude', 'centroid_longitude', 'centroid_hour', 'centroid_day_of_week']
labels_in_order = centroid_data['cluster_label'].unique()

intracluster_distances = [
    pairwise_distances(
        centroid_data.loc[centroid_data['cluster_label'] == label, distance_columns],
        metric='euclidean',
    ).mean()
    for label in labels_in_order
]

# Pair every label with its distance and rank the clusters, tightest first
# (lower distance implies a better-defined cluster).
cluster_distances = pd.DataFrame(
    {'Cluster': labels_in_order, 'Intracluster Distance': intracluster_distances}
).sort_values(by='Intracluster Distance')

# Display the ranked clusters
print(cluster_distances)

   Cluster  Intracluster Distance
3        2               0.000000
4        1               0.000000
1        0               0.214571
2        4               0.326788
0        3               0.867960


In [5]:
# Euclidean distance between the mean points of every pair of clusters.
centroid_columns = ['centroid_latitude', 'centroid_longitude', 'centroid_hour', 'centroid_day_of_week']
cluster_labels = centroid_data['cluster_label'].unique()

# Mean point of each cluster, computed once and looked up per pair below
cluster_means = {
    label: centroid_data.loc[centroid_data['cluster_label'] == label, centroid_columns].mean()
    for label in cluster_labels
}

# One record per unordered pair of labels; [0, 1] picks the distance between
# the first and the second point out of the 2x2 matrix.
intercluster_distances = [
    {
        'Cluster 1': label1,
        'Cluster 2': label2,
        'Intercluster Distance': pairwise_distances(
            [cluster_means[label1], cluster_means[label2]], metric='euclidean'
        )[0, 1],
    }
    for label1, label2 in itertools.combinations(cluster_labels, 2)
]

# Tabulate and display the intercluster distances
intercluster_distances_df = pd.DataFrame(intercluster_distances)
print(intercluster_distances_df)

   Cluster 1  Cluster 2  Intercluster Distance
0          3          0               1.206195
1          3          4               2.223774
2          3          2               2.852917
3          3          1               6.675795
4          0          4               1.018757
5          0          2               4.000117
6          0          1               6.578408
7          4          2               5.002799
8          4          1               6.656044
9          2          1               7.688017


In [6]:
# Attach the user-level cluster label to every training row; the left join
# keeps all rows of train_df and matches them to centroid_data on 'id'.
df = train_df.merge(centroid_data[['id', 'cluster_label']], how='left', on='id')

# Display the updated DataFrame with cluster labels
#print(df)

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Train one RandomForestRegressor per cluster, keyed by cluster label.
feature_columns = ['hour', 'day_of_week']
target_columns = ['latitude', 'longitude']  # both coordinates are predicted jointly

cluster_models = {}
for cluster_label in df['cluster_label'].unique():
    # Training rows that belong to the current cluster
    cluster_rows = df.loc[df['cluster_label'] == cluster_label]

    # fit() returns the fitted estimator, so it can be stored directly
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    cluster_models[cluster_label] = forest.fit(
        cluster_rows[feature_columns], cluster_rows[target_columns]
    )

# Now 'cluster_models' dictionary contains trained models for each cluster label

In [8]:
# Sanity check: list every cluster label together with the model stored for it
for label, model in cluster_models.items():
    print(f"Cluster Label: {label}", f"Model: {model}", "----------", sep="\n")

Cluster Label: 3
Model: RandomForestRegressor(random_state=42)
----------
Cluster Label: 0
Model: RandomForestRegressor(random_state=42)
----------
Cluster Label: 4
Model: RandomForestRegressor(random_state=42)
----------
Cluster Label: 2
Model: RandomForestRegressor(random_state=42)
----------
Cluster Label: 1
Model: RandomForestRegressor(random_state=42)
----------


# **Generating the test file with latitude and longitude predictions made from the per-cluster models**

In [9]:

# Lookup table: user id -> cluster label assigned to that user's centroid
cluster_by_id = centroid_data.set_index('id')['cluster_label']

# Load the test set, derive the same time features as for the training data,
# and give every row the cluster label of its user.  Ids without an entry in
# the lookup table get NaN from map().
test_df = pd.read_csv('test_data_redacted.csv').assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'),
    hour=lambda frame: frame['timestamp'].dt.hour,
    day_of_week=lambda frame: frame['timestamp'].dt.dayofweek,
    cluster_label=lambda frame: frame['id'].map(cluster_by_id),
)

# Define a function to run the model for each cluster
def run_model(cluster_data):
    """Return a copy of ``cluster_data`` with predicted coordinates attached.

    Parameters
    ----------
    cluster_data : pandas.DataFrame
        Rows that share one ``cluster_label``.  Must contain the columns
        ``cluster_label``, ``hour`` and ``day_of_week``.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with ``predicted_latitude`` and
        ``predicted_longitude`` columns.  Both columns are NaN when the input
        is empty or when ``cluster_models`` holds no model for the label.

    Notes
    -----
    The input frame is never modified: the function is used with
    ``groupby(...).apply``, and pandas does not support mutating the group
    that is handed to the applied function.
    """
    result = cluster_data.copy()

    # Always create both output columns so callers can select them even when
    # no prediction could be made.
    result['predicted_latitude'] = float('nan')
    result['predicted_longitude'] = float('nan')
    if result.empty:
        return result

    # All rows of the group carry the same label, so the first one is enough
    cluster_label = result['cluster_label'].iloc[0]
    if cluster_label in cluster_models:
        cluster_model = cluster_models[cluster_label]
        predicted_coordinates = cluster_model.predict(result[['hour', 'day_of_week']])
        # Column 0 is latitude, column 1 is longitude (order of the targets
        # the models were fitted on)
        result['predicted_latitude'] = predicted_coordinates[:, 0]
        result['predicted_longitude'] = predicted_coordinates[:, 1]
    return result

# Run each cluster's model on that cluster's test rows.  group_keys=False keeps
# the cluster label out of the result index, so the result is indexed like
# test_df.  Rows whose cluster_label is NaN form no group (groupby drops NaN
# keys by default) and are therefore absent from updated_df.
prediction_columns = ['predicted_latitude', 'predicted_longitude']
rows_by_cluster = test_df.groupby('cluster_label', group_keys=False)
updated_df = rows_by_cluster.apply(run_model)

# Display the updated DataFrame with predicted latitude and longitude
#print(updated_df)

# Keep only the two prediction columns
predicted_coordinates_df = updated_df[prediction_columns].copy()

# Re-read the untouched test file and place the predictions next to it;
# concat with axis=1 lines the two frames up on their row index.
original_test_df = pd.read_csv('test_data_redacted.csv')
per_cluster_preds = pd.concat([original_test_df, predicted_coordinates_df], axis=1)

per_cluster_preds.to_csv('/content/per_cluster_predictions.csv', index=False)
#print(per_cluster_preds)

# **Creating the per-user models**

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Group the training rows by user
grouped = train_df.groupby('id')

# Train one RandomForestRegressor per user, keyed by the user's 'id'.
# Features are the two time features; both coordinates are predicted jointly.
# fit() returns the fitted estimator, which is what gets stored.
user_models = {
    group_id: RandomForestRegressor(n_estimators=100, random_state=42).fit(
        group_data[['hour', 'day_of_week']],
        group_data[['latitude', 'longitude']],
    )
    for group_id, group_data in grouped
}

In [11]:
# Remove cluster_label from test_df as it's not needed anymore.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
test_df = test_df.drop(columns=['cluster_label'], errors='ignore')


In [12]:
# Define a function to run the model for each ID
def run_model_by_id(id_data):
    """Return a copy of ``id_data`` with predicted coordinates attached.

    Parameters
    ----------
    id_data : pandas.DataFrame
        Rows that share one ``id``.  Must contain the columns ``id``,
        ``hour`` and ``day_of_week``.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with ``predicted_latitude`` and
        ``predicted_longitude`` columns.  Both columns are NaN when the input
        is empty or when ``user_models`` holds no model for the id.

    Notes
    -----
    The input frame is never modified: the function is used with
    ``groupby(...).apply``, and pandas does not support mutating the group
    that is handed to the applied function.
    """
    result = id_data.copy()

    # Always create both output columns so callers can select them even when
    # no prediction could be made.
    result['predicted_latitude'] = float('nan')
    result['predicted_longitude'] = float('nan')
    if result.empty:
        return result

    # All rows of the group carry the same id, so the first one is enough
    id_value = result['id'].iloc[0]
    if id_value in user_models:
        user_model = user_models[id_value]
        predicted_coordinates = user_model.predict(result[['hour', 'day_of_week']])
        # Column 0 is latitude, column 1 is longitude (order of the targets
        # the models were fitted on)
        result['predicted_latitude'] = predicted_coordinates[:, 0]
        result['predicted_longitude'] = predicted_coordinates[:, 1]
    return result

# Run each user's model on that user's test rows.  group_keys=False keeps the
# id out of the result index, so the result is indexed like test_df.
prediction_columns = ['predicted_latitude', 'predicted_longitude']
rows_by_user = test_df.groupby('id', group_keys=False)
updated_df = rows_by_user.apply(run_model_by_id)

# Display the updated DataFrame with predicted latitude and longitude
#print(updated_df)

# Keep only the two prediction columns
predicted_coordinates_df = updated_df[prediction_columns].copy()

# Re-read the untouched test file and place the predictions next to it;
# concat with axis=1 lines the two frames up on their row index.
original_test_df1 = pd.read_csv('test_data_redacted.csv')
per_user_preds = pd.concat([original_test_df1, predicted_coordinates_df], axis=1)
per_user_preds.to_csv('/content/per_user_predictions.csv', index=False)