In [1]:
%pip install pyspark 

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the dataset
df = pd.read_csv("/Users/muskan/Documents/SOEN 6111/BigDataProject/Rate/Rate_Cleaned.csv",index_col=0)

In [4]:
import numpy as np
# Assuming the dataset is very large, select the random 5,000 rows as a subset for analysis to manage memory and computational efficiency.
num_rows = df.shape[0]
random_rows = np.random.choice(num_rows, size=5000, replace=False)
df = df.iloc[random_rows, :]

In [5]:
# Define features and target for the machine learning model
features = ['BusinessYear', 'StateCode', 'SourceName', 'RatingAreaId', 'Tobacco', 'Age', 'PrimarySubscriberAndThreeOrMoreDependents','tobacco_rate']
target = 'PlanId'
# df.head()

In [6]:
# Initial DataFrame inspection and handling missing values
df = df.dropna()

In [7]:
print(df.shape)

(5000, 10)

In [8]:
from sklearn.preprocessing import LabelEncoder
# Label Encoding: Convert categorical text data into a model-usable numerical format
cols_to_encode = ['SourceName','StateCode','Tobacco','PlanId','RatingAreaId']

les = [LabelEncoder() for _ in range(len(cols_to_encode))]
for i,col in enumerate(cols_to_encode):
    df[col] = les[i].fit_transform(df[col])


In [9]:
# Split the dataset into training and testing sets to prepare for model training and evaluation. 80% of the data is used for training and 20% for testing
X_train, X_test, y_train, y_test= train_test_split(df[features], df[[target]], test_size=0.2, random_state=42)

In [10]:
X_train

Unnamed: 0,BusinessYear,StateCode,SourceName,RatingAreaId,Tobacco,Age,PrimarySubscriberAndThreeOrMoreDependents,tobacco_rate
4227,2014,0,0,0,0,33,0.0,357.00
4676,2014,0,0,2,0,31,0.0,462.00
800,2014,0,0,2,0,10,0.0,47.06
3671,2014,0,0,0,0,60,0.0,1159.58
4193,2014,0,0,0,0,27,0.0,373.98
...,...,...,...,...,...,...,...,...
4426,2014,0,0,1,0,27,0.0,417.75
466,2014,0,0,1,0,10,0.0,38.46
3092,2014,0,0,1,0,36,0.0,568.38
3772,2014,0,0,1,0,34,0.0,526.00


In [11]:
# Recommendation process based on cosine similarity
user_recs = []
for i,user_features in enumerate(X_test.iterrows()):
     # Calculate the cosine similarity between the current test instance and all training instances
    similarity_scores = np.array([cosine_similarity(user_features[1].values.reshape(1, -1), item[1].values.reshape(1, -1))[0] for t,item in enumerate(X_train.iterrows())])
    # Find the top 10 most similar items based on cosine similarity scores
    top_indices = np.argsort(similarity_scores.flatten())[-10:]
     # Store the indices of these items for recommendation
    user_recs.append({'user': i, 'recommendations': top_indices})


In [12]:
# Example: Get recommendations for a specific user and decode the recommended Plan IDs
user_id = 0
recommendations = user_recs[user_id]['recommendations']
print(f"Recommendations for user {user_id}:")
for i, plan_idx in enumerate(recommendations):
  # Decode Plan ID using LabelEncoder to get the original value
  print(les[3].inverse_transform(y_train.iloc[plan_idx]))
  # Additionally, showing the actual Plan ID of the user for comparison
  print(les[3].inverse_transform(y_test.iloc[user_id]))



Recommendations for user 0:
['73836AK0680011']
['73836AK0680004']
['73836AK0680015']
['73836AK0680004']
['73836AK0680011']
['73836AK0680004']
['73836AK0680007']
['73836AK0680004']
['73836AK0680007']
['73836AK0680004']
['73836AK0680011']
['73836AK0680004']
['73836AK0680011']
['73836AK0680004']
['73836AK0680004']
['73836AK0680004']
['73836AK0680004']
['73836AK0680004']
['73836AK0680004']
['73836AK0680004']


In [13]:
# Generate predictions for the test set based on the highest similarity score
y_pred = []
for user_id in range(len(X_test)):
  recommendations = user_recs[user_id]['recommendations']
  # Predict the last recommendation (highest similarity score)
  y_pred.append(les[3].inverse_transform(y_train.iloc[recommendations[-1]]))

In [14]:
# Evaluate the recommendation performance using F1 score
print(f1_score(les[3].inverse_transform(y_test),y_pred,average='weighted'))

0.7815498011097902


  y = column_or_1d(y, warn=True)


In [15]:
from sklearn.metrics import  classification_report

In [16]:
print(y_pred)

[array(['73836AK0680004'], dtype=object),
 array(['73836AK0680007'], dtype=object),
 array(['73836AK0680007'], dtype=object),
 array(['45858AK0030001'], dtype=object),
 array(['42507AK0020001'], dtype=object),
 array(['21989AK0010001'], dtype=object),
 array(['42507AK0020001'], dtype=object),
 array(['73836AK0630001'], dtype=object),
 array(['73836AK0680007'], dtype=object),
 array(['73836AK0680004'], dtype=object),
 array(['73836AK0680007'], dtype=object),
 array(['38536AK0010001'], dtype=object),
 array(['45858AK0040001'], dtype=object),
 array(['42507AK0010001'], dtype=object),
 array(['73836AK0680011'], dtype=object),
 array(['73836AK0680015'], dtype=object),
 array(['45858AK0040001'], dtype=object),
 array(['73836AK0680011'], dtype=object),
 array(['73836AK0680004'], dtype=object),
 array(['45858AK0040001'], dtype=object),
 array(['45858AK0030002'], dtype=object),
 array(['73836AK0680011'], dtype=object),
 array(['45858AK0030001'], dtype=object),
 array(['42507AK0020001'], dtype=o

In [17]:
print(y_test)

Unnamed: 0,PlanId
1501,22
2586,23
2653,23
1055,13
705,6
...,...
4711,26
2313,23
3214,24
2732,23


In [18]:
# Provide a detailed classification report
print(classification_report(les[3].inverse_transform(y_test), y_pred))

                precision    recall  f1-score   support

21989AK0010001       0.94      0.97      0.96        34
38536AK0010001       1.00      0.97      0.98        33
38536AK0010002       0.96      0.93      0.94        27
38536AK0010003       0.97      1.00      0.98        30
42507AK0010001       0.09      0.10      0.09        31
42507AK0020001       0.15      0.15      0.15        34
42507AK0030001       1.00      1.00      1.00         1
42507AK0040001       1.00      1.00      1.00         1
42507AK0060001       1.00      1.00      1.00         1
45858AK0030001       0.00      0.00      0.00        21
45858AK0030002       0.00      0.00      0.00        33
45858AK0040001       0.03      0.04      0.04        26
45858AK0040002       0.00      0.00      0.00        27
73836AK0620001       0.95      1.00      0.98        21
73836AK0630001       0.90      0.93      0.91        28
73836AK0630002       0.81      0.84      0.82        25
73836AK0640001       0.90      0.90      0.90  

  y = column_or_1d(y, warn=True)
