In [37]:
# project: p7
# submitter: lliu356
# partner: none
# hours: 4

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.compose import make_column_transformer
import netaddr
import copy
import os

class UserPredictor():
    
    def __init__(self):
        self.model = Pipeline([('both',make_column_transformer((OneHotEncoder(),['badge']),
                                                               (OneHotEncoder(handle_unknown = 'ignore'),['region']),
                                                               (PolynomialFeatures(degree = 2,include_bias=False),['weighted_time']),
                                                               remainder = "passthrough")),
                               ('cls', LogisticRegression(fit_intercept=False))])       
        self.features_cols = ['region','badge','age','past_purchase_amt','weighted_time']
        self.y = 'y'
        
    def process_raw(self,users,logs, y = None):
        ### user:
        user_tmp = users #.copy()
        
        ### logs: match region
        ip2check_test = list(logs['ip_address'])
        ip_df = self.ip2location_load()
        match_region_out = self.ip_check(ip2check_test,ip_df)
        logs_add_region = logs #.copy()
        logs_add_region['region'] = match_region_out
        logs_match_region = logs_add_region[['id','region']].drop_duplicates()
        
        ### logs: match weigted time 
        page_type = list(logs['url_visited'])
        
        ### trnsform url (manually)
        url_transform = []
        #url_ref = {}
        for i in range(0,len(page_type)):
            tmp = page_type[i].replace('.html','').replace('/','')
            url_transform.append(tmp)
            #url_ref[tmp] = url_ref.setdefault(tmp,0) + 1
        
        # Any better idea than hardcoding these categories...?
        laptop = ['laptop']; office = ['tablet','keyboard','monitor','printer','desk']
        
        ### weight time
        page_type = list(logs['url_visited'])
        minutes_raw = list(logs['minutes_on_page'])
        minutes_weighted = [None] * len(minutes_raw)
        
        for i in range(0,len(page_type)):
            tmp = page_type[i].replace('.html','').replace('/','')
            if tmp in laptop:
                minutes_weighted[i] = minutes_raw[i] * 0.7
            elif tmp in office:
                minutes_weighted[i] = minutes_raw[i] * 0.2
            elif tmp == 'NotFound':
                minutes_weighted[i] = 0
        # extract        
        logs_match_weight = logs#.copy()
        logs_match_weight['weighted_time'] = minutes_weighted
        logs_match_weight = pd.DataFrame(logs_match_weight.groupby('id')['weighted_time'].sum())
        
        ### merge:
        user_wtime = pd.merge(user_tmp,logs_match_weight,on = 'id',how='left').fillna(0)
        out = pd.merge(user_wtime,logs_match_region,on = 'id',how='left').fillna('NotFound')
        if not y is None:
            out = pd.merge(out,y,how='left',on = 'id')
            out['y'] = out['y'].map({0:False,1:True})
            return out
        else:
            return out

    def fit(self, train_users, train_logs, train_y, crs_val = False):
        
        self.features_train = self.process_raw(train_users, train_logs, train_y)
        
        self.model.fit(self.features_train[self.features_cols],self.features_train[self.y])
        
        if crs_val:
            crs_val_scores  = cross_val_score(self.model,
                                              self.features_train[self.features_cols],
                                              self.features_train[self.y])
            print(f"AVG: {crs_val_scores.mean()}, STD: {crs_val_scores.std()}\n")
            
    def predict(self,test_users, test_logs):
        self.features_test = self.process_raw(test_users, test_logs)
        y_predicted = self.model.predict(self.features_test[self.features_cols])
                       
        return np.asarray(list(map(int, y_predicted)))
    
    def BinarySearch(self,arr,x):
        left, right = 0, len(arr) - 1
        while left < right:
            mid = left + (right - left) // 2
            if x - arr[mid] > arr[mid + 1] - x:
                left = mid + 1
            else:
                right = mid
        if arr[left-1] <= x and arr[left] > x:
            return left - 1
        else:
            return left
                       
    def ip2location_load(self):
        with open(os.path.join('data','ip2location.csv')) as f:
            out_raw = f.read()
        out = out_raw.split("\n")
        ip_raw = []
        for line in out:
            line_tmp = line.split(",")
            ip_raw.append(line_tmp)    
        ip_df = pd.DataFrame(ip_raw)
        ip_df.rename(columns = ip_df.iloc[0], inplace = True)
        ip_df.drop(ip_df.index[0],inplace = True)
        ip_df.drop(ip_df.index[-1],inplace = True) # something weird
        ip_df.sort_values(by = ["low"]) # sort for binary search
        ip_df.reset_index(drop = True,inplace = True)
        return ip_df
    
    def ip_check(self,ips,ip_df):
        search_sequence = list(map(int, ip_df["low"]))
        ip_match_out = []
        for ip in ips:    
            try:
                int_ip = int(netaddr.IPAddress(ip))
                index_tmp = self.BinarySearch(search_sequence,int_ip)
                matched_region = ip_df.iloc[index_tmp,3]
                ip_match_out.append(matched_region)  
            except:
                print("Execution halted...")
                break
        return ip_match_out

In [34]:
users = pd.read_csv("data/train_users.csv")
logs = pd.read_csv("data/train_logs.csv")

page_type = list(logs['url_visited'])

        
### trnsform url (manually)
url_transform = []
#url_ref = {}
for i in range(0,len(page_type)):
    tmp = page_type[i].replace('.html','').replace('/','')
    url_transform.append(tmp)
        
        # Any better idea than hardcoding these categories...?
laptop = ['laptop']; office = ['tablet','keyboard','monitor','printer','desk']
        
        ### weight time
page_type = list(logs['url_visited'])
minutes_raw = list(logs['minutes_on_page'])
minutes_weighted = [None] * len(minutes_raw)
        
for i in range(0,len(page_type)):
    tmp = page_type[i].replace('.html','').replace('/','')
    if tmp in laptop:
        minutes_weighted[i] = minutes_raw[i] * 0.7
    elif tmp in office:
        minutes_weighted[i] = minutes_raw[i] * 0.2
    else:
        minutes_weighted[i] = minutes_raw[i] * 0.1

logs_match_weight = logs#.copy()
logs_match_weight['weighted_time'] = minutes_weighted
logs_match_weight = pd.DataFrame(logs_match_weight.groupby('id')['weighted_time'].sum())
logs

Unnamed: 0,date,id,ip_address,url_visited,minutes_on_page,weighted_time
0,6-14-2020,19083,123.196.90.97,/blender.html,7.222454,0.722245
1,8-22-2020,764,123.50.162.196,/cleats.html,5.472573,0.547257
2,9-14-2020,733,192.141.247.60,/tablet.html,8.396012,1.679202
3,8-27-2020,16282,203.28.112.62,/keyboard.html,4.139941,0.827988
4,7-29-2020,5694,23.78.141.31,/cleats.html,11.204377,1.120438
...,...,...,...,...,...,...
199995,7-14-2020,10299,108.61.207.238,/cooler.html,17.208746,1.720875
199996,4-17-2020,981,45.124.21.4,/basketball.html,10.926431,1.092643
199997,9-27-2020,10451,103.15.97.131,/monitor.html,11.502494,2.300499
199998,4-13-2020,2885,203.26.56.100,/keyboard.html,5.056544,1.011309


In [28]:
t = pd.merge(users,logs_match_weight,how = 'left',on = 'id').fillna(0)
t['weighted_time'].isnull().values.any()
#out = pd.merge(,logs_match_weight,on = 'id')

False

In [38]:
model = UserPredictor()
train_users = pd.read_csv("data/train_users.csv")
train_logs = pd.read_csv("data/train_logs.csv")
train_y = pd.read_csv("data/train_y.csv")
model.fit(train_users, train_logs, train_y, crs_val = True)

   id             name  age   badge  past_purchase_amt  weighted_time region
0   1      Nikki Young   36  bronze              42.94      12.976759  Japan
1   2    William Moats   40    gold              10.03       6.472002  China
2   3       John Lemke   24  silver             203.37      15.156423  China
3   4  Elizabeth Gavin   39  bronze             132.04       3.521110  Japan
4   5     Myrtle Blais   37  bronze              41.56      12.304702  Japan


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

AVG: 0.9243500000000001, STD: 0.00753093619678191



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [39]:
test_users = pd.read_csv("data/test1_users.csv")
test_logs = pd.read_csv("data/test1_logs.csv")
y_pred = model.predict(test_users, test_logs)
len(y_pred)

       id              name  age   badge  past_purchase_amt  weighted_time  \
0  100001       William Lee   33  bronze              10.12       3.298134   
1  100002     Ernest Glover   37  bronze              21.22       5.270474   
2  100003    James Thompson   31  silver              18.02      15.125220   
3  100004      Lillie Yates   33    gold              43.41      46.534725   
4  100005  George Schaeffer   32  bronze              11.55       5.060739   

                     region  
0  United States of America  
1                    Mexico  
2                     Japan  
3                     Japan  
4  United States of America  


20000