# Import Packages & Create Dataframes

In [1]:
#import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
#import behaviors file
behaviors = pd.read_csv(r"C:\Users\josia\Desktop\Capstone_Project\MINDlarge_train\behaviors.tsv", sep='\t')

In [3]:
#convert behavior data to dataframe
b_df = pd.DataFrame(behaviors)

# Examine the behavior data

In [4]:
b_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2232747 entries, 0 to 2232746
Data columns (total 5 columns):
 #   Column                                                                                                                                                                     Dtype 
---  ------                                                                                                                                                                     ----- 
 0   1                                                                                                                                                                          int64 
 1   U87243                                                                                                                                                                     object
 2   11/10/2019 11:30:54 AM                                                                                                                                 

In [5]:
print(b_df.head())

   1   U87243  11/10/2019 11:30:54 AM  \
0  2  U598644   11/12/2019 1:45:29 PM   
1  3  U532401  11/13/2019 11:23:03 AM   
2  4  U593596  11/12/2019 12:24:09 PM   
3  5  U239687   11/14/2019 8:03:01 PM   
4  6  U521853  11/11/2019 10:47:31 AM   

  N8668 N39081 N65259 N79529 N73408 N43615 N29379 N32031 N110232 N101921 N12614 N129591 N105760 N60457 N1229 N64932  \
0  N56056 N8726 N70353 N67998 N83823 N111108 N107...                                                                  
1  N128643 N87446 N122948 N9375 N82348 N129412 N5...                                                                  
2  N31043 N39592 N4104 N8223 N114581 N92747 N1207...                                                                  
3  N65250 N122359 N71723 N53796 N41663 N41484 N11...                                                                  
4  N8668 N29136 N128643 N9740 N9375 N52911 N12090...                                                                  

  N78206-0 N26368-0 N7578-0 N58592-0 N

In [6]:
#Create column names based on dataset legend found elsewhere
b_column_names = ['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions']
b_df.columns = b_column_names

# Take Sample

In [7]:
b_df = b_df.sample(n=1000, random_state=1)

# Data Legend & Next Steps

# Wrangle the Behavior data

In [10]:
#Drop 'Impression_ID' and 'Time' columns as they do not pertain to my use case
columns_to_drop = ['Impression_ID', 'Time']
b_df = b_df.drop(columns = columns_to_drop)
print(b_df)

         User_ID                                            History  \
1837023  U591810                                N91352 N4555 N88075   
1328700   U21881  N1128 N79529 N45491 N79909 N104091 N53933 N852...   
1645076   U71066  N81496 N116312 N20359 N29595 N104200 N57016 N3...   
2155197  U455452  N109183 N63078 N91173 N45756 N122766 N9979 N82...   
527583   U240645               N127644 N40915 N27226 N120031 N27763   
...          ...                                                ...   
624522   U189640  N118998 N9375 N44759 N85102 N111415 N120908 N1...   
2175868  U408841  N22950 N67127 N97282 N105630 N19384 N108828 N2...   
256336   U339536  N34486 N106464 N125587 N80256 N31043 N86634 N9...   
2227092  U291734  N128643 N85484 N63078 N26935 N9375 N5060 N7257...   
741124   U660599  N7154 N87105 N109183 N121551 N33446 N48215 N94...   

                                               Impressions  
1837023  N67123-0 N100343-0 N23823-0 N8719-0 N11682-0 N...  
1328700  N8599-0 N8236-0 

In [11]:
#Group data by User ID
b_df_g = b_df.sort_values(by = 'User_ID')

In [12]:
#Check for missing values
b_df_g_missing = b_df_g.isnull().sum()
print(b_df_g_missing)

User_ID         0
History        24
Impressions     0
dtype: int64


In [13]:
#Examine type of missing value
b_df_g_hist_miss = b_df_g['History'].isnull()

In [14]:
print(b_df_g_hist_miss)

746803     False
1788588    False
1044349    False
840718     False
1719838    False
           ...  
1095245    False
2134768    False
1927012    False
349753     False
552070     False
Name: History, Length: 1000, dtype: bool


In [15]:
#Drop all rows with missing value
b_df_g_drop = b_df_g.dropna()
print(b_df_g_drop.isnull().sum())

User_ID        0
History        0
Impressions    0
dtype: int64


In [16]:
#Check to see how many remaining rows with data there are
print(b_df_g_drop.shape[0])

976


In [18]:
#rename data frame
b_df = b_df_g_drop

In [19]:
b_df.dtypes

User_ID        object
History        object
Impressions    object
dtype: object

In [20]:
#group data by User_ID
grouped_df = b_df.groupby('User_ID').agg({
    'History': lambda x: ','.join(x),  # Combine all 'History' values into one string
    'Impressions': lambda x: ','.join(x)  # Combine all 'Impressions' values into one string
}).reset_index()

# Step 2: Remove duplicates within each combined string
grouped_df['History'] = grouped_df['History'].apply(lambda x: ','.join(set(x.split(','))))
grouped_df['Impressions'] = grouped_df['Impressions'].apply(lambda x: ','.join(set(x.split(','))))

print(grouped_df)

     User_ID                                            History  \
0    U100159                                             N59700   
1    U100253  N94639 N104181 N6612 N86208 N1229 N10000 N6317...   
2    U100346  N90185 N61319 N15797 N65119 N94639 N52595 N769...   
3    U100602  N67127 N31305 N63723 N75655 N82474 N105205 N71...   
4    U102239         N128965 N90630 N50274 N123400 N6930 N91597   
..       ...                                                ...   
970   U94628  N128965 N31043 N9375 N95100 N113932 N106155 N1...   
971   U95270  N27587 N15899 N93049 N72590 N71977 N91597 N915...   
972   U95990  N127337 N29265 N80126 N79285 N10766 N65299 N73...   
973   U96367  N15833 N52075 N47788 N19175 N22633 N112687 N10...   
974   U96536  N128965 N31043 N9375 N12326 N94639 N58992 N769...   

                                           Impressions  
0    N2750-0 N59301-0 N60135-0 N116647-0 N92498-1 N...  
1    N123784-0 N88472-0 N100846-0 N72609-1 N46229-0...  
2    N128940-0 N125953-0

In [21]:
grouped_df.dtypes

User_ID        object
History        object
Impressions    object
dtype: object

In [22]:
# Function to add "-1" to each ID and join back with spaces
def add_suffix_to_ids(ids_string):
    ids_list = ids_string.split()  # Split the string of IDs into a list
    ids_list_with_suffix = [id + '-1' for id in ids_list]  # Add "-1" to each ID
    return ' '.join(ids_list_with_suffix)  # Join the modified IDs back with spaces

# Apply the function to the 'History' column
grouped_df['History'] = grouped_df['History'].apply(add_suffix_to_ids)

# Display the updated DataFrame
print(grouped_df)

     User_ID                                            History  \
0    U100159                                           N59700-1   
1    U100253  N94639-1 N104181-1 N6612-1 N86208-1 N1229-1 N1...   
2    U100346  N90185-1 N61319-1 N15797-1 N65119-1 N94639-1 N...   
3    U100602  N67127-1 N31305-1 N63723-1 N75655-1 N82474-1 N...   
4    U102239  N128965-1 N90630-1 N50274-1 N123400-1 N6930-1 ...   
..       ...                                                ...   
970   U94628  N128965-1 N31043-1 N9375-1 N95100-1 N113932-1 ...   
971   U95270  N27587-1 N15899-1 N93049-1 N72590-1 N71977-1 N...   
972   U95990  N127337-1 N29265-1 N80126-1 N79285-1 N10766-1 ...   
973   U96367  N15833-1 N52075-1 N47788-1 N19175-1 N22633-1 N...   
974   U96536  N128965-1 N31043-1 N9375-1 N12326-1 N94639-1 N...   

                                           Impressions  
0    N2750-0 N59301-0 N60135-0 N116647-0 N92498-1 N...  
1    N123784-0 N88472-0 N100846-0 N72609-1 N46229-0...  
2    N128940-0 N125953-0

In [23]:
#combine the history and impressions columns together
def combine_history_impressions(row):
    history_list = row['History'].split()  # Split 'History' string into a list of IDs
    impressions_list = row['Impressions'].split()  # Split 'Impressions' string into a list of IDs
    combined_list = list(set(history_list + impressions_list))  # Combine and remove duplicates
    combined_string = ' '.join(combined_list)  # Join the combined list back into a string
    return combined_string

# Apply the function to create a new 'Combined' column
grouped_df['Combined'] = grouped_df.apply(combine_history_impressions, axis=1)

# Display the updated DataFrame
print(grouped_df)

     User_ID                                            History  \
0    U100159                                           N59700-1   
1    U100253  N94639-1 N104181-1 N6612-1 N86208-1 N1229-1 N1...   
2    U100346  N90185-1 N61319-1 N15797-1 N65119-1 N94639-1 N...   
3    U100602  N67127-1 N31305-1 N63723-1 N75655-1 N82474-1 N...   
4    U102239  N128965-1 N90630-1 N50274-1 N123400-1 N6930-1 ...   
..       ...                                                ...   
970   U94628  N128965-1 N31043-1 N9375-1 N95100-1 N113932-1 ...   
971   U95270  N27587-1 N15899-1 N93049-1 N72590-1 N71977-1 N...   
972   U95990  N127337-1 N29265-1 N80126-1 N79285-1 N10766-1 ...   
973   U96367  N15833-1 N52075-1 N47788-1 N19175-1 N22633-1 N...   
974   U96536  N128965-1 N31043-1 N9375-1 N12326-1 N94639-1 N...   

                                           Impressions  \
0    N2750-0 N59301-0 N60135-0 N116647-0 N92498-1 N...   
1    N123784-0 N88472-0 N100846-0 N72609-1 N46229-0...   
2    N128940-0 N12595

In [24]:
#drop impressions and history columns
column_list = ['Impressions', 'History']
grouped_df.drop(columns = column_list, inplace=True)

In [25]:
print(grouped_df)

     User_ID                                           Combined
0    U100159  N118883-0 N60135-0 N100073-0 N113126-0 N116647...
1    U100253  N82405-1 N106659-0 N19460-0 N129969-0 N6930-1 ...
2    U100346  N25967-0 N49791-0 N90185-1 N30119-0 N83421-0 N...
3    U100602  N22589-1 N116694-0 N35189-1 N102443-1 N33378-1...
4    U102239  N6930-1 N90630-1 N31879-0 N47026-0 N55792-0 N1...
..       ...                                                ...
970   U94628  N25967-0 N49791-0 N113932-1 N37379-0 N122561-1...
971   U95270  N72590-1 N15899-1 N93049-1 N110572-1 N91597-1 ...
972   U95990  N61321-1 N72590-1 N117074-1 N80126-1 N33037-0 ...
973   U96367  N120919-0 N47788-1 N35189-0 N96356-0 N118259-0...
974   U96536  N67319-1 N76970-1 N129412-1 N39948-1 N425-0 N5...

[975 rows x 2 columns]


In [26]:
#renamed combined column
grouped_df.rename(columns={'Combined': 'Impressions'}, inplace=True)
print(grouped_df)

     User_ID                                        Impressions
0    U100159  N118883-0 N60135-0 N100073-0 N113126-0 N116647...
1    U100253  N82405-1 N106659-0 N19460-0 N129969-0 N6930-1 ...
2    U100346  N25967-0 N49791-0 N90185-1 N30119-0 N83421-0 N...
3    U100602  N22589-1 N116694-0 N35189-1 N102443-1 N33378-1...
4    U102239  N6930-1 N90630-1 N31879-0 N47026-0 N55792-0 N1...
..       ...                                                ...
970   U94628  N25967-0 N49791-0 N113932-1 N37379-0 N122561-1...
971   U95270  N72590-1 N15899-1 N93049-1 N110572-1 N91597-1 ...
972   U95990  N61321-1 N72590-1 N117074-1 N80126-1 N33037-0 ...
973   U96367  N120919-0 N47788-1 N35189-0 N96356-0 N118259-0...
974   U96536  N67319-1 N76970-1 N129412-1 N39948-1 N425-0 N5...

[975 rows x 2 columns]


In [27]:
#rename df back to b_df
b_df = grouped_df

In [28]:
#Check for missing values
b_df.isnull().sum()

User_ID        0
Impressions    0
dtype: int64

In [29]:
#Check for duplicates
b_df.duplicated().sum()

0

# Prepare for User-Item Matrix

In [30]:
#reassign dataframe
b_df_final = b_df

# Define batch size
batch_size = 1000  # Adjust according to memory constraints

# Process DataFrame in chunks
chunks = [b_df_final[i:i+batch_size] for i in range(0, len(b_df_final), batch_size)]
expanded_dfs = []

for chunk in chunks:
    expanded_df = chunk.assign(Impression_ID=chunk['Impressions'].str.split()).explode('Impression_ID')
    expanded_df.rename(columns={'User_ID': 'User_ID', 'Impression_ID': 'Impression_ID'}, inplace=True)
    expanded_dfs.append(expanded_df)

# Concatenate all expanded DataFrames
b_df_final = pd.concat(expanded_dfs, ignore_index=True)

print(b_df_final)


       User_ID                                        Impressions  \
0      U100159  N118883-0 N60135-0 N100073-0 N113126-0 N116647...   
1      U100159  N118883-0 N60135-0 N100073-0 N113126-0 N116647...   
2      U100159  N118883-0 N60135-0 N100073-0 N113126-0 N116647...   
3      U100159  N118883-0 N60135-0 N100073-0 N113126-0 N116647...   
4      U100159  N118883-0 N60135-0 N100073-0 N113126-0 N116647...   
...        ...                                                ...   
70658   U96536  N67319-1 N76970-1 N129412-1 N39948-1 N425-0 N5...   
70659   U96536  N67319-1 N76970-1 N129412-1 N39948-1 N425-0 N5...   
70660   U96536  N67319-1 N76970-1 N129412-1 N39948-1 N425-0 N5...   
70661   U96536  N67319-1 N76970-1 N129412-1 N39948-1 N425-0 N5...   
70662   U96536  N67319-1 N76970-1 N129412-1 N39948-1 N425-0 N5...   

      Impression_ID  
0         N118883-0  
1          N60135-0  
2         N100073-0  
3         N113126-0  
4         N116647-0  
...             ...  
70658      N91597

In [31]:
b_df_final.drop(['Impressions'], axis=1, inplace = True)

In [32]:
print(b_df_final)

       User_ID Impression_ID
0      U100159     N118883-0
1      U100159      N60135-0
2      U100159     N100073-0
3      U100159     N113126-0
4      U100159     N116647-0
...        ...           ...
70658   U96536      N91597-1
70659   U96536       N1713-1
70660   U96536      N59700-1
70661   U96536     N117709-1
70662   U96536      N36861-0

[70663 rows x 2 columns]


In [33]:
b_df_final.head(10000).tail()

Unnamed: 0,User_ID,Impression_ID
9995,U189932,N38938-0
9996,U189932,N96616-1
9997,U189932,N71665-1
9998,U189932,N73222-1
9999,U189932,N124876-0


In [34]:
b_df_final['Interaction'] = b_df_final['Impression_ID'].str[-1]

In [35]:
print(b_df_final)

       User_ID Impression_ID Interaction
0      U100159     N118883-0           0
1      U100159      N60135-0           0
2      U100159     N100073-0           0
3      U100159     N113126-0           0
4      U100159     N116647-0           0
...        ...           ...         ...
70658   U96536      N91597-1           1
70659   U96536       N1713-1           1
70660   U96536      N59700-1           1
70661   U96536     N117709-1           1
70662   U96536      N36861-0           0

[70663 rows x 3 columns]


In [36]:
b_df_final['Article_ID'] = b_df_final['Impression_ID'].str[:-2]

In [37]:
print(b_df_final)

       User_ID Impression_ID Interaction Article_ID
0      U100159     N118883-0           0    N118883
1      U100159      N60135-0           0     N60135
2      U100159     N100073-0           0    N100073
3      U100159     N113126-0           0    N113126
4      U100159     N116647-0           0    N116647
...        ...           ...         ...        ...
70658   U96536      N91597-1           1     N91597
70659   U96536       N1713-1           1      N1713
70660   U96536      N59700-1           1     N59700
70661   U96536     N117709-1           1    N117709
70662   U96536      N36861-0           0     N36861

[70663 rows x 4 columns]


In [38]:
b_df_final.drop(columns = ['Impression_ID'])

Unnamed: 0,User_ID,Interaction,Article_ID
0,U100159,0,N118883
1,U100159,0,N60135
2,U100159,0,N100073
3,U100159,0,N113126
4,U100159,0,N116647
...,...,...,...
70658,U96536,1,N91597
70659,U96536,1,N1713
70660,U96536,1,N59700
70661,U96536,1,N117709


In [39]:
new_order = ['User_ID', 'Article_ID', 'Interaction']

# Reorder the DataFrame columns
b_df_final = b_df_final[new_order]

print(b_df_final)

       User_ID Article_ID Interaction
0      U100159    N118883           0
1      U100159     N60135           0
2      U100159    N100073           0
3      U100159    N113126           0
4      U100159    N116647           0
...        ...        ...         ...
70658   U96536     N91597           1
70659   U96536      N1713           1
70660   U96536     N59700           1
70661   U96536    N117709           1
70662   U96536     N36861           0

[70663 rows x 3 columns]


# Convert User_ID and Article ID to numeric values

In [40]:
b_df_final['Article_ID'] = b_df_final['Article_ID'].str.lstrip('N')
b_df_final['User_ID'] = b_df_final['User_ID'].str.lstrip('U')
print(b_df_final)

      User_ID Article_ID Interaction
0      100159     118883           0
1      100159      60135           0
2      100159     100073           0
3      100159     113126           0
4      100159     116647           0
...       ...        ...         ...
70658   96536      91597           1
70659   96536       1713           1
70660   96536      59700           1
70661   96536     117709           1
70662   96536      36861           0

[70663 rows x 3 columns]


In [41]:
#check for errors
mask = b_df_final['Article_ID'].str.len() > 6
filtered_df = b_df_final[mask]
print(filtered_df)

      User_ID       Article_ID Interaction
38159  454695  102458-0,N42649           0


In [42]:
b_df_final = b_df_final.drop(38177)

In [43]:
print(b_df_final)

      User_ID Article_ID Interaction
0      100159     118883           0
1      100159      60135           0
2      100159     100073           0
3      100159     113126           0
4      100159     116647           0
...       ...        ...         ...
70658   96536      91597           1
70659   96536       1713           1
70660   96536      59700           1
70661   96536     117709           1
70662   96536      36861           0

[70662 rows x 3 columns]


In [44]:
# Convert columns to numeric
b_df_final['User_ID'] = pd.to_numeric(b_df_final['User_ID'], errors='coerce')
b_df_final['Article_ID'] = pd.to_numeric(b_df_final['Article_ID'], errors='coerce')
b_df_final['Interaction'] = pd.to_numeric(b_df_final['Interaction'], errors='coerce')

In [45]:
b_df_final.isnull().sum()

User_ID        0
Article_ID     1
Interaction    0
dtype: int64

In [46]:
b_df_final = b_df_final.dropna()

In [47]:
print(b_df_final.shape)

(70661, 3)


In [48]:
print(b_df_final)
b_df_final.dtypes

       User_ID  Article_ID  Interaction
0       100159    118883.0            0
1       100159     60135.0            0
2       100159    100073.0            0
3       100159    113126.0            0
4       100159    116647.0            0
...        ...         ...          ...
70658    96536     91597.0            1
70659    96536      1713.0            1
70660    96536     59700.0            1
70661    96536    117709.0            1
70662    96536     36861.0            0

[70661 rows x 3 columns]


User_ID          int64
Article_ID     float64
Interaction      int64
dtype: object

In [49]:
df = b_df_final

# Model Building & Evaluation

# Logistic Regression Model:

In [50]:
# Features (User_ID and Article_ID) and Target (Interaction)
X = df[['User_ID', 'Article_ID']]
y = df[['Interaction']]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression on known interactions (0s and 1s)
from sklearn.linear_model import LogisticRegression
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg_model.predict(X_test)

# Evaluate the model performance
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

  y = column_or_1d(y, warn=True)


Accuracy: 0.5083846317130121
Precision: 0.502968779927217
Recall: 0.37627167215933516
F1 Score: 0.4304918032786885


In [51]:
# Define a parameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['liblinear', 'saga'],  # 'saga' supports 'elasticnet'
    'max_iter': [100, 200, 300]
}

# Initialize logistic regression model
log_reg_model = LogisticRegression()

# Initialize GridSearchCV with logistic regression and the parameter grid
grid_search = GridSearchCV(estimator=log_reg_model, param_grid=param_grid, scoring='accuracy', cv=5)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Train the best model
best_log_reg_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_log_reg_model.predict(X_test)

# Evaluate the model performance
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\josia\anaconda3\Desktop\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\josia\anaconda3\Desktop\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\josia\anaconda3\Desktop\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^

Best parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best score: 0.5118702949817913
Accuracy: 0.5103658105143989
Precision: 0.5063182694367102
Recall: 0.3387304771457229
F1 Score: 0.4059065934065934


# Random Forest Classifier Model:

In [52]:
# Initialize and train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model performance
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')
print(f'ROC AUC Score: {roc_auc_score(y_test, y_pred)}')

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.7098988183683578
Precision: 0.7168248230155144
Recall: 0.681902851411377
F1 Score: 0.6989278895579382
ROC AUC Score: 0.7095564019427586


# K Nearest Neighbors Classifier:

In [55]:
# Initialize and train KNN Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test)

# Evaluate the model performance
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')
print(f'ROC AUC Score: {roc_auc_score(y_test, y_pred)}')

  return self._fit(X, y)


Accuracy: 0.5937168329441732
Precision: 0.5894950079583273
Recall: 0.5837512537612839
F1 Score: 0.5866090712742982
ROC AUC Score: 0.5935949447447739


# Next Steps

It appears that the random forest classifier has performed best at predicting which articles users will click on based on the click history of this sample. I may choose to increase my sample size from 1,000 to 10,000 in the future to see if additional data could improve accuracy. Moreover, the objective of this project was not to provide users with an algorithm that feeds them articles they would click, but rather, one that also exposes them to news sources outside their information bubble. I will move forward discussing with my mentor how to use additional algorithms and/or steps to implement this stated goal. Also, may do a bayesian grid search for the Random Forest model next. 