# Import Packages & Create Dataframes

In [1]:
#import packages
import pandas as pd
import pyspark

In [2]:
#import behaviors file
behaviors = pd.read_csv(r"C:\Users\josia\Desktop\Capstone_Project\MINDlarge_train\behaviors.tsv", sep='\t')

In [3]:
#convert behavior data to dataframe
b_df = pd.DataFrame(behaviors)

# Examine the behavior data

In [4]:
b_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2232747 entries, 0 to 2232746
Data columns (total 5 columns):
 #   Column                                                                                                                                                                     Dtype 
---  ------                                                                                                                                                                     ----- 
 0   1                                                                                                                                                                          int64 
 1   U87243                                                                                                                                                                     object
 2   11/10/2019 11:30:54 AM                                                                                                                                 

In [5]:
print(b_df.head())

   1   U87243  11/10/2019 11:30:54 AM  \
0  2  U598644   11/12/2019 1:45:29 PM   
1  3  U532401  11/13/2019 11:23:03 AM   
2  4  U593596  11/12/2019 12:24:09 PM   
3  5  U239687   11/14/2019 8:03:01 PM   
4  6  U521853  11/11/2019 10:47:31 AM   

  N8668 N39081 N65259 N79529 N73408 N43615 N29379 N32031 N110232 N101921 N12614 N129591 N105760 N60457 N1229 N64932  \
0  N56056 N8726 N70353 N67998 N83823 N111108 N107...                                                                  
1  N128643 N87446 N122948 N9375 N82348 N129412 N5...                                                                  
2  N31043 N39592 N4104 N8223 N114581 N92747 N1207...                                                                  
3  N65250 N122359 N71723 N53796 N41663 N41484 N11...                                                                  
4  N8668 N29136 N128643 N9740 N9375 N52911 N12090...                                                                  

  N78206-0 N26368-0 N7578-0 N58592-0 N

In [6]:
#Create column names based on dataset legend found elsewhere
b_column_names = ['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions']
b_df.columns = b_column_names

In [7]:
print(b_df.head())

   Impression_ID  User_ID                    Time  \
0              2  U598644   11/12/2019 1:45:29 PM   
1              3  U532401  11/13/2019 11:23:03 AM   
2              4  U593596  11/12/2019 12:24:09 PM   
3              5  U239687   11/14/2019 8:03:01 PM   
4              6  U521853  11/11/2019 10:47:31 AM   

                                             History  \
0  N56056 N8726 N70353 N67998 N83823 N111108 N107...   
1  N128643 N87446 N122948 N9375 N82348 N129412 N5...   
2  N31043 N39592 N4104 N8223 N114581 N92747 N1207...   
3  N65250 N122359 N71723 N53796 N41663 N41484 N11...   
4  N8668 N29136 N128643 N9740 N9375 N52911 N12090...   

                                         Impressions  
0  N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...  
1              N103852-0 N53474-0 N127836-0 N47925-1  
2  N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...  
3  N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...  
4                N32154-0 N67747-0 N47257-0 N98178-1  


randomly sample^^ ??
store random sample into b_df

In [8]:
b_df = b_df.sample(n=1000, random_state=1)

In [9]:
print(b_df['Impressions'].head())

1837023    N67123-0 N100343-0 N23823-0 N8719-0 N11682-0 N...
1328700    N8599-0 N8236-0 N56873-0 N76665-0 N51163-0 N65...
1645076         N53474-0 N5287-0 N113723-1 N24701-1 N76677-0
2155197    N90184-0 N67937-0 N81374-0 N15403-1 N48015-0 N...
527583     N96356-0 N73137-0 N52435-0 N31766-0 N118589-0 ...
Name: Impressions, dtype: object


# Data Legend & Next Steps

# Wrangle the Behavior data

In [10]:
#Group data by User ID
b_df_g = b_df.sort_values(by = 'User_ID')

In [11]:
#Check for missing values
b_df_g_missing = b_df_g.isnull().sum()
print(b_df_g_missing)

Impression_ID     0
User_ID           0
Time              0
History          24
Impressions       0
dtype: int64


In [12]:
#Examine type of missing value
b_df_g_hist_miss = b_df_g['History'].isnull()

In [13]:
print(b_df_g_hist_miss)

746803     False
1788588    False
1044349    False
840718     False
1719838    False
           ...  
1095245    False
2134768    False
1927012    False
349753     False
552070     False
Name: History, Length: 1000, dtype: bool


In [14]:
#Drop all rows with missing value
b_df_g_drop = b_df_g.dropna()
print(b_df_g_drop.isnull().sum())

Impression_ID    0
User_ID          0
Time             0
History          0
Impressions      0
dtype: int64


In [15]:
#Check to see how many remaining rows with data there are
print(b_df_g_drop.shape[0])

976


In [16]:
#Drop 'Impression_ID' and 'Time' columns as they do not pertain to my use case
columns_to_drop = ['Impression_ID', 'Time']
b_df_g_c_drop = b_df_g_drop.drop(columns = columns_to_drop)
print(b_df_g_c_drop)

         User_ID                                            History  \
746803   U100159                                             N59700   
1788588  U100253  N94639 N104181 N6612 N86208 N1229 N10000 N6317...   
1044349  U100346  N90185 N61319 N15797 N65119 N94639 N52595 N769...   
840718   U100602  N67127 N31305 N63723 N75655 N82474 N105205 N71...   
1719838  U102239         N128965 N90630 N50274 N123400 N6930 N91597   
...          ...                                                ...   
1095245   U94628  N128965 N31043 N9375 N95100 N113932 N106155 N1...   
2134768   U95270  N27587 N15899 N93049 N72590 N71977 N91597 N915...   
1927012   U95990  N127337 N29265 N80126 N79285 N10766 N65299 N73...   
349753    U96367  N15833 N52075 N47788 N19175 N22633 N112687 N10...   
552070    U96536  N128965 N31043 N9375 N12326 N94639 N58992 N769...   

                                               Impressions  
746803   N2750-0 N59301-0 N60135-0 N116647-0 N92498-1 N...  
1788588  N123784-0 N88472

In [17]:
#rename data frame
b_df = b_df_g_c_drop

In [18]:
b_df.dtypes

User_ID        object
History        object
Impressions    object
dtype: object

In [19]:
#group data by User_ID
grouped_df = b_df.groupby('User_ID').agg({
    'History': lambda x: ','.join(x),  # Combine all 'History' values into one string
    'Impressions': lambda x: ','.join(x)  # Combine all 'Impressions' values into one string
}).reset_index()

# Step 2: Remove duplicates within each combined string
grouped_df['History'] = grouped_df['History'].apply(lambda x: ','.join(set(x.split(','))))
grouped_df['Impressions'] = grouped_df['Impressions'].apply(lambda x: ','.join(set(x.split(','))))

print(grouped_df)

     User_ID                                            History  \
0    U100159                                             N59700   
1    U100253  N94639 N104181 N6612 N86208 N1229 N10000 N6317...   
2    U100346  N90185 N61319 N15797 N65119 N94639 N52595 N769...   
3    U100602  N67127 N31305 N63723 N75655 N82474 N105205 N71...   
4    U102239         N128965 N90630 N50274 N123400 N6930 N91597   
..       ...                                                ...   
970   U94628  N128965 N31043 N9375 N95100 N113932 N106155 N1...   
971   U95270  N27587 N15899 N93049 N72590 N71977 N91597 N915...   
972   U95990  N127337 N29265 N80126 N79285 N10766 N65299 N73...   
973   U96367  N15833 N52075 N47788 N19175 N22633 N112687 N10...   
974   U96536  N128965 N31043 N9375 N12326 N94639 N58992 N769...   

                                           Impressions  
0    N2750-0 N59301-0 N60135-0 N116647-0 N92498-1 N...  
1    N123784-0 N88472-0 N100846-0 N72609-1 N46229-0...  
2    N128940-0 N125953-0

In [20]:
grouped_df.dtypes

User_ID        object
History        object
Impressions    object
dtype: object

In [21]:
# Function to add "-1" to each ID and join back with spaces
def add_suffix_to_ids(ids_string):
    ids_list = ids_string.split()  # Split the string of IDs into a list
    ids_list_with_suffix = [id + '-1' for id in ids_list]  # Add "-1" to each ID
    return ' '.join(ids_list_with_suffix)  # Join the modified IDs back with spaces

# Apply the function to the 'History' column
grouped_df['History'] = grouped_df['History'].apply(add_suffix_to_ids)

# Display the updated DataFrame
print(grouped_df)

     User_ID                                            History  \
0    U100159                                           N59700-1   
1    U100253  N94639-1 N104181-1 N6612-1 N86208-1 N1229-1 N1...   
2    U100346  N90185-1 N61319-1 N15797-1 N65119-1 N94639-1 N...   
3    U100602  N67127-1 N31305-1 N63723-1 N75655-1 N82474-1 N...   
4    U102239  N128965-1 N90630-1 N50274-1 N123400-1 N6930-1 ...   
..       ...                                                ...   
970   U94628  N128965-1 N31043-1 N9375-1 N95100-1 N113932-1 ...   
971   U95270  N27587-1 N15899-1 N93049-1 N72590-1 N71977-1 N...   
972   U95990  N127337-1 N29265-1 N80126-1 N79285-1 N10766-1 ...   
973   U96367  N15833-1 N52075-1 N47788-1 N19175-1 N22633-1 N...   
974   U96536  N128965-1 N31043-1 N9375-1 N12326-1 N94639-1 N...   

                                           Impressions  
0    N2750-0 N59301-0 N60135-0 N116647-0 N92498-1 N...  
1    N123784-0 N88472-0 N100846-0 N72609-1 N46229-0...  
2    N128940-0 N125953-0

In [22]:
#combine the history and impressions columns together
def combine_history_impressions(row):
    history_list = row['History'].split()  # Split 'History' string into a list of IDs
    impressions_list = row['Impressions'].split()  # Split 'Impressions' string into a list of IDs
    combined_list = list(set(history_list + impressions_list))  # Combine and remove duplicates
    combined_string = ' '.join(combined_list)  # Join the combined list back into a string
    return combined_string

# Apply the function to create a new 'Combined' column
grouped_df['Combined'] = grouped_df.apply(combine_history_impressions, axis=1)

# Display the updated DataFrame
print(grouped_df)

     User_ID                                            History  \
0    U100159                                           N59700-1   
1    U100253  N94639-1 N104181-1 N6612-1 N86208-1 N1229-1 N1...   
2    U100346  N90185-1 N61319-1 N15797-1 N65119-1 N94639-1 N...   
3    U100602  N67127-1 N31305-1 N63723-1 N75655-1 N82474-1 N...   
4    U102239  N128965-1 N90630-1 N50274-1 N123400-1 N6930-1 ...   
..       ...                                                ...   
970   U94628  N128965-1 N31043-1 N9375-1 N95100-1 N113932-1 ...   
971   U95270  N27587-1 N15899-1 N93049-1 N72590-1 N71977-1 N...   
972   U95990  N127337-1 N29265-1 N80126-1 N79285-1 N10766-1 ...   
973   U96367  N15833-1 N52075-1 N47788-1 N19175-1 N22633-1 N...   
974   U96536  N128965-1 N31043-1 N9375-1 N12326-1 N94639-1 N...   

                                           Impressions  \
0    N2750-0 N59301-0 N60135-0 N116647-0 N92498-1 N...   
1    N123784-0 N88472-0 N100846-0 N72609-1 N46229-0...   
2    N128940-0 N12595

In [23]:
#drop impressions and history columns
column_list = ['Impressions', 'History']
grouped_df.drop(columns = column_list, inplace=True)

In [24]:
print(grouped_df)

     User_ID                                           Combined
0    U100159  N18539-0 N83412-0 N92498-1 N52079-0 N115021-0 ...
1    U100253  N63174-1 N89409-0 N128367-0 N123784-0 N46229-0...
2    U100346  N95341-0 N8475-0 N96337-0 N49466-0 N86429-0 N1...
3    U100602  N63723-1 N76810-1 N108832-1 N50645-1 N102443-1...
4    U102239  N123234-0 N87855-0 N31879-0 N55792-0 N47026-0 ...
..       ...                                                ...
970   U94628  N9191-1 N64149-0 N91047-0 N425-0 N73137-0 N542...
971   U95270  N27587-1 N45410-0 N74875-1 N71977-1 N91597-1 N...
972   U95990  N72590-1 N125945-0 N65299-1 N31879-1 N127337-1...
973   U96367  N53271-0 N59654-0 N52075-1 N73137-0 N55579-0 N...
974   U96536  N39948-1 N58200-1 N112665-1 N425-0 N91597-1 N1...

[975 rows x 2 columns]


In [25]:
#renamed combined column
grouped_df.rename(columns={'Combined': 'Impressions'}, inplace=True)
print(grouped_df)

     User_ID                                        Impressions
0    U100159  N18539-0 N83412-0 N92498-1 N52079-0 N115021-0 ...
1    U100253  N63174-1 N89409-0 N128367-0 N123784-0 N46229-0...
2    U100346  N95341-0 N8475-0 N96337-0 N49466-0 N86429-0 N1...
3    U100602  N63723-1 N76810-1 N108832-1 N50645-1 N102443-1...
4    U102239  N123234-0 N87855-0 N31879-0 N55792-0 N47026-0 ...
..       ...                                                ...
970   U94628  N9191-1 N64149-0 N91047-0 N425-0 N73137-0 N542...
971   U95270  N27587-1 N45410-0 N74875-1 N71977-1 N91597-1 N...
972   U95990  N72590-1 N125945-0 N65299-1 N31879-1 N127337-1...
973   U96367  N53271-0 N59654-0 N52075-1 N73137-0 N55579-0 N...
974   U96536  N39948-1 N58200-1 N112665-1 N425-0 N91597-1 N1...

[975 rows x 2 columns]


In [26]:
#rename df back to b_df
b_df = grouped_df

In [27]:
#Check for missing values
b_df.isnull().sum()

User_ID        0
Impressions    0
dtype: int64

In [28]:
#Check for duplicates
b_df.duplicated().sum()

0

# Prepare for User-Item Matrix

In [29]:
#reassign dataframe
b_df_final = b_df

# Define batch size
batch_size = 1000  # Adjust according to memory constraints

# Process DataFrame in chunks
chunks = [b_df_final[i:i+batch_size] for i in range(0, len(b_df_final), batch_size)]
expanded_dfs = []

for chunk in chunks:
    expanded_df = chunk.assign(Impression_ID=chunk['Impressions'].str.split()).explode('Impression_ID')
    expanded_df.rename(columns={'User_ID': 'User_ID', 'Impression_ID': 'Impression_ID'}, inplace=True)
    expanded_dfs.append(expanded_df)

# Concatenate all expanded DataFrames
b_df_final = pd.concat(expanded_dfs, ignore_index=True)

print(b_df_final)


       User_ID                                        Impressions  \
0      U100159  N18539-0 N83412-0 N92498-1 N52079-0 N115021-0 ...   
1      U100159  N18539-0 N83412-0 N92498-1 N52079-0 N115021-0 ...   
2      U100159  N18539-0 N83412-0 N92498-1 N52079-0 N115021-0 ...   
3      U100159  N18539-0 N83412-0 N92498-1 N52079-0 N115021-0 ...   
4      U100159  N18539-0 N83412-0 N92498-1 N52079-0 N115021-0 ...   
...        ...                                                ...   
70657   U96536  N39948-1 N58200-1 N112665-1 N425-0 N91597-1 N1...   
70658   U96536  N39948-1 N58200-1 N112665-1 N425-0 N91597-1 N1...   
70659   U96536  N39948-1 N58200-1 N112665-1 N425-0 N91597-1 N1...   
70660   U96536  N39948-1 N58200-1 N112665-1 N425-0 N91597-1 N1...   
70661   U96536  N39948-1 N58200-1 N112665-1 N425-0 N91597-1 N1...   

      Impression_ID  
0          N18539-0  
1          N83412-0  
2          N92498-1  
3          N52079-0  
4         N115021-0  
...             ...  
70657     N117709

In [30]:
b_df_final.drop(['Impressions'], axis=1, inplace = True)

In [31]:
print(b_df_final)

       User_ID Impression_ID
0      U100159      N18539-0
1      U100159      N83412-0
2      U100159      N92498-1
3      U100159      N52079-0
4      U100159     N115021-0
...        ...           ...
70657   U96536     N117709-1
70658   U96536      N31043-1
70659   U96536      N82158-1
70660   U96536      N58992-1
70661   U96536     N100832-1

[70662 rows x 2 columns]


In [32]:
b_df_final.head(10000).tail()

Unnamed: 0,User_ID,Impression_ID
9995,U189932,N16614-1
9996,U189932,N35782-0
9997,U189932,N8991-1
9998,U189932,N75492-1
9999,U189932,N41975-0


In [33]:
b_df_final['Interaction'] = b_df_final['Impression_ID'].str[-1]

In [34]:
print(b_df_final)

       User_ID Impression_ID Interaction
0      U100159      N18539-0           0
1      U100159      N83412-0           0
2      U100159      N92498-1           1
3      U100159      N52079-0           0
4      U100159     N115021-0           0
...        ...           ...         ...
70657   U96536     N117709-1           1
70658   U96536      N31043-1           1
70659   U96536      N82158-1           1
70660   U96536      N58992-1           1
70661   U96536     N100832-1           1

[70662 rows x 3 columns]


In [35]:
b_df_final['Article_ID'] = b_df_final['Impression_ID'].str[:-2]

In [36]:
print(b_df_final)

       User_ID Impression_ID Interaction Article_ID
0      U100159      N18539-0           0     N18539
1      U100159      N83412-0           0     N83412
2      U100159      N92498-1           1     N92498
3      U100159      N52079-0           0     N52079
4      U100159     N115021-0           0    N115021
...        ...           ...         ...        ...
70657   U96536     N117709-1           1    N117709
70658   U96536      N31043-1           1     N31043
70659   U96536      N82158-1           1     N82158
70660   U96536      N58992-1           1     N58992
70661   U96536     N100832-1           1    N100832

[70662 rows x 4 columns]


In [37]:
b_df_final.drop(columns = ['Impression_ID'])

Unnamed: 0,User_ID,Interaction,Article_ID
0,U100159,0,N18539
1,U100159,0,N83412
2,U100159,1,N92498
3,U100159,0,N52079
4,U100159,0,N115021
...,...,...,...
70657,U96536,1,N117709
70658,U96536,1,N31043
70659,U96536,1,N82158
70660,U96536,1,N58992


In [38]:
new_order = ['User_ID', 'Article_ID', 'Interaction']

# Reorder the DataFrame columns
b_df_final = b_df_final[new_order]

print(b_df_final)

       User_ID Article_ID Interaction
0      U100159     N18539           0
1      U100159     N83412           0
2      U100159     N92498           1
3      U100159     N52079           0
4      U100159    N115021           0
...        ...        ...         ...
70657   U96536    N117709           1
70658   U96536     N31043           1
70659   U96536     N82158           1
70660   U96536     N58992           1
70661   U96536    N100832           1

[70662 rows x 3 columns]


# Convert User_ID and Article ID to numeric values

In [39]:
b_df_final['Article_ID'] = b_df_final['Article_ID'].str.lstrip('N')
b_df_final['User_ID'] = b_df_final['User_ID'].str.lstrip('U')
print(b_df_final)

      User_ID Article_ID Interaction
0      100159      18539           0
1      100159      83412           0
2      100159      92498           1
3      100159      52079           0
4      100159     115021           0
...       ...        ...         ...
70657   96536     117709           1
70658   96536      31043           1
70659   96536      82158           1
70660   96536      58992           1
70661   96536     100832           1

[70662 rows x 3 columns]


In [40]:
#check for errors
mask = b_df_final['Article_ID'].str.len() > 6
filtered_df = b_df_final[mask]
print(filtered_df)

      User_ID      Article_ID Interaction
38179  454695  91865-0,N91865           0


In [41]:
b_df_final = b_df_final.drop(38177)

In [42]:
print(b_df_final)

      User_ID Article_ID Interaction
0      100159      18539           0
1      100159      83412           0
2      100159      92498           1
3      100159      52079           0
4      100159     115021           0
...       ...        ...         ...
70657   96536     117709           1
70658   96536      31043           1
70659   96536      82158           1
70660   96536      58992           1
70661   96536     100832           1

[70661 rows x 3 columns]


In [43]:
# Convert columns to numeric
b_df_final['User_ID'] = pd.to_numeric(b_df_final['User_ID'], errors='coerce')
b_df_final['Article_ID'] = pd.to_numeric(b_df_final['Article_ID'], errors='coerce')
b_df_final['Interaction'] = pd.to_numeric(b_df_final['Interaction'], errors='coerce')

In [44]:
b_df_final.isnull().sum()

User_ID        0
Article_ID     1
Interaction    0
dtype: int64

In [45]:
b_df_final = b_df_final.dropna()

In [46]:
print(b_df_final.shape)

(70660, 3)


In [47]:
print(b_df_final)
b_df_final.dtypes

       User_ID  Article_ID  Interaction
0       100159     18539.0            0
1       100159     83412.0            0
2       100159     92498.0            1
3       100159     52079.0            0
4       100159    115021.0            0
...        ...         ...          ...
70657    96536    117709.0            1
70658    96536     31043.0            1
70659    96536     82158.0            1
70660    96536     58992.0            1
70661    96536    100832.0            1

[70660 rows x 3 columns]


User_ID          int64
Article_ID     float64
Interaction      int64
dtype: object

In [48]:
user_item_matrix = b_df_final.pivot_table(index='User_ID', columns='Article_ID', values='Interaction')

In [49]:
print(user_item_matrix)

Article_ID  14.0      39.0      60.0      62.0      68.0      75.0      \
User_ID                                                                  
1911             NaN       1.0       NaN       NaN       NaN       NaN   
2541             NaN       NaN       NaN       NaN       NaN       NaN   
2961             NaN       NaN       NaN       NaN       NaN       NaN   
3758             NaN       NaN       NaN       NaN       NaN       NaN   
5812             NaN       NaN       NaN       NaN       NaN       NaN   
...              ...       ...       ...       ...       ...       ...   
707491           NaN       NaN       NaN       NaN       NaN       NaN   
707631           NaN       NaN       NaN       NaN       NaN       NaN   
709048           NaN       NaN       NaN       NaN       NaN       NaN   
710638           NaN       NaN       NaN       NaN       NaN       NaN   
710997           NaN       NaN       NaN       NaN       NaN       NaN   

Article_ID  80.0      91.0      99.0 

# Import Model Packages

Collaborative filtering with ALS

In [50]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [51]:
from pyspark.sql import SparkSession
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Initialize Spark session

spark = SparkSession.builder \
    .appName("ALSExample") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .config("spark.driver.cores", "2") \
    .getOrCreate()

# Initialize Spark session
#spark = SparkSession.builder.appName("recommendation_system").getOrCreate()

# Convert pandas dataframe to spark dataframe
df = spark.createDataFrame(b_df_final)

# Splitting, Training, and Evaluating the Model

In [52]:
#Create test and train set
(training, test) = df.randomSplit([0.8, 0.2])

#Create ALS model
als = ALS(userCol='User_ID', itemCol='Article_ID', ratingCol='Interaction', 
         coldStartStrategy='drop', nonnegative = True)

In [53]:
#Tune model using ParamGridBuilder
param_grid = ParamGridBuilder()\
        .addGrid(als.rank, [12, 13, 14])\
        .addGrid(als.maxIter, [18,19,20])\
        .addGrid(als.regParam, [.17, .18, .19])\
        .build()
#Define evaluator as RMSE
evaluator = RegressionEvaluator(metricName='rmse', labelCol='Interaction', predictionCol='prediction')

#Build cross validation using TrainValidationSplit
tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator)

In [54]:
#Fit ALS model to training data
model = tvs.fit(training)

In [55]:
#Extract best model from the tuning exercise using ParamGridBuilder
best_model = model.bestModel

In [57]:
#Generate predictions and evaluate using RMSE
predictions = best_model.transform(test)

In [59]:
rmse = evaluator.evaluate(predictions)

In [61]:
# Print evaluation metrics
print('RMSE =', rmse)

# Print model parameters
print('**Best Model**')

# Check the type of best_model to determine if it has the attributes you are looking for
print(' Rank:', getattr(best_model, 'rank', 'N/A'))
print(' MaxIter:', getattr(best_model._java_obj.parent(), 'getMaxIter', lambda: 'N/A')())
print(' RegParam:', getattr(best_model._java_obj.parent(), 'getRegParam', lambda: 'N/A')())

RMSE = 0.2603147310499995
**Best Model**
 Rank: 14
 MaxIter: 19
 RegParam: 0.17


Note on creating dummy features and standardizing/scaling the data:

In this case Using dummy features on binary data would not make sense. Binary data only has two possible values represening a click or no click on an article in this case. They are already interpretable and there is no need to convert them to dummy variables which would just duplicate the information and not add value. 

Additionally, scaling already binary data would not have any effect, and standardizing  binary data is not necessary and may in fact negatively impact our model by distoring the interpretation of the binary values. 