In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost
from sklearn.cluster import KMeans
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

import seaborn as sns
from sklearn.feature_selection import SelectKBest

df_sub = pd.read_csv('./train_submissions.csv')
df_prob = pd.read_csv('./problem_data.csv')
df_user = pd.read_csv('./user_data.csv')
test= pd.read_csv('./test_submissions_NeDLEvX.csv')


#check=pd.read_csv('./sample_submissions_wbscxqU.csv')

In [2]:
test.shape

(66555, 3)

In [3]:
df_sub.head()

Unnamed: 0,user_id,problem_id,attempts_range
0,user_232,prob_6507,1
1,user_3568,prob_2994,3
2,user_1600,prob_5071,1
3,user_2256,prob_703,1
4,user_2321,prob_356,1


In [4]:
df_user.head()

Unnamed: 0,user_id,submission_count,problem_solved,contribution,country,follower_count,last_online_time_seconds,max_rating,rating,rank,registration_time_seconds
0,user_3311,47,40,0,,4,1504111645,348.337,330.849,intermediate,1466686436
1,user_3028,63,52,0,India,17,1498998165,405.677,339.45,intermediate,1441893325
2,user_2268,226,203,-8,Egypt,24,1505566052,307.339,284.404,beginner,1454267603
3,user_480,611,490,1,Ukraine,94,1505257499,525.803,471.33,advanced,1350720417
4,user_650,504,479,12,Russia,4,1496613433,548.739,486.525,advanced,1395560498


In [5]:
df_prob.head()

Unnamed: 0,problem_id,level_type,points,tags
0,prob_3649,H,,
1,prob_6191,A,,
2,prob_2020,F,,
3,prob_313,A,500.0,"greedy,implementation"
4,prob_101,A,500.0,"constructive algorithms,greedy,math"


In [6]:
test.head()

Unnamed: 0,ID,user_id,problem_id
0,user_856_prob_5822,user_856,prob_5822
1,user_2642_prob_2334,user_2642,prob_2334
2,user_2557_prob_2920,user_2557,prob_2920
3,user_1572_prob_4598,user_1572,prob_4598
4,user_295_prob_6139,user_295,prob_6139


In [7]:
# to split tags column into words and use each word as a feature
def sep(x):
    dic = {}
    l=[]
    for j in range(x.shape[0]):
        n= len(x[j].split(','))
        dic={}
        for i in range(n):
            dic.update({ x[j].split(',')[i].strip(): 1})
        l.append(dic)
    return l
        
         
        
    

In [8]:
def processing(df_sub,df_user,df_prob):
    
    df_prob.loc[  df_prob['tags'].isnull(),'tags']  = ' '

    a=sep(df_prob['tags'])

    df_tags= pd.DataFrame(a)

    df_tags.fillna(0,inplace=True)

    df_prob = df_prob.join(df_tags)
    
    
    df_user['country'].fillna(df_user['country'].mode()[0],inplace= True)

    le = LabelEncoder()
    df_user['country'] = le.fit_transform(df_user['country'])
    
    train = df_sub.merge( df_prob, left_on= 'problem_id',right_on='problem_id', how='left')
    
    train = train.merge(df_user, left_on='user_id', right_on='user_id', how= 'left')

    train.drop(['tags'],axis=1,inplace=True)

    train['level_type'] = train['level_type'].fillna(train['level_type'].mode()[0])


    dic = {'A': 0, 'B' : 1,'C': 2,'D': 3,'E': 4,'F': 5,'G': 6,'H': 7,'I': 8,'J': 9,'K': 10,'L': 11,'M': 12,'N': 13 }

    train['level_type'] = train['level_type'].map(dic)

    points_fill = {0:500,1:1000,2:1500,3:2000,4:2500,5:2750,6:3000,7:3250,8:3500,9:3750,10:4000,11:4250,12:4500,13:4750}

    train = train.join( train[train['points'].isnull()]['level_type'].map(points_fill), how='left',rsuffix='ch')

    train.loc[  train['points'].isnull(),'points']  = 0
    train.loc[  train['level_typech'].isnull(),'level_typech']  = 0

    train['points_final'] = train['points'] + train['level_typech']  
    
    rank_fill = {'beginner':0, 'intermediate':1, 'advanced':2, 'expert':3}
    
    train['rank'] = train['rank'].map(rank_fill)
    
    train.drop(['points','level_typech'],axis=1,inplace=True)

    X = train.iloc[:,3:]
    X=X.drop([''],axis=1)
    
    try:
        y = train['attempts_range']
        return X,y
    except:
        return X
        
    

In [9]:
X,y = processing(df_sub,df_user,df_prob) 


In [10]:
test_X = processing(test,df_user,df_prob)

In [11]:
X.columns

Index(['level_type', '*special', '2-sat', 'binary search', 'bitmasks',
       'brute force', 'chinese remainder theorem', 'combinatorics',
       'constructive algorithms', 'data structures', 'dfs and similar',
       'divide and conquer', 'dp', 'dsu', 'expression parsing', 'fft', 'flows',
       'games', 'geometry', 'graph matchings', 'graphs', 'greedy', 'hashing',
       'implementation', 'math', 'matrices', 'meet-in-the-middle',
       'number theory', 'probabilities', 'schedules', 'shortest paths',
       'sortings', 'string suffix structures', 'strings', 'ternary search',
       'trees', 'two pointers', 'submission_count', 'problem_solved',
       'contribution', 'country', 'follower_count', 'last_online_time_seconds',
       'max_rating', 'rating', 'rank', 'registration_time_seconds',
       'points_final'],
      dtype='object')

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15)

std = StandardScaler()


std.fit(X_train)

X1_train = std.transform(X_train)

In [25]:
X1_test = std.transform(X_test)

## Models

In [13]:
y_train.value_counts()

1    70262
2    40288
3    12021
4     4726
6     2597
5     2106
Name: attempts_range, dtype: int64

In [14]:
y.value_counts()

1    82804
2    47320
3    14143
4     5499
6     3033
5     2496
Name: attempts_range, dtype: int64

In [30]:
X_resampled, y_resampled = SMOTETomek(ratio={4:12000,5:12000,6:12000}).fit_sample(X1_train, y_train)
#X_resampled, y_resampled = SMOTE().fit_sample(X1_train, y_train)
#X_resampled, y_resampled = RandomUnderSampler(ratio={1:60000}).fit_sample(X_train, y_train)
#X_resampled, y_resampled = TomekLinks().fit_sample(X_resampled, y_resampled)

In [31]:
pd.Series(y_resampled).value_counts()

1    61769
2    32059
6    11615
5    11606
4    11013
3     8735
dtype: int64

In [32]:
cf= RandomForestClassifier(n_jobs=-1,n_estimators=100,max_features=0.3,min_samples_leaf=3)
cf.fit(X_resampled,y_resampled)




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [37]:
X_train.columns

Index(['level_type', '*special', '2-sat', 'binary search', 'bitmasks',
       'brute force', 'chinese remainder theorem', 'combinatorics',
       'constructive algorithms', 'data structures', 'dfs and similar',
       'divide and conquer', 'dp', 'dsu', 'expression parsing', 'fft', 'flows',
       'games', 'geometry', 'graph matchings', 'graphs', 'greedy', 'hashing',
       'implementation', 'math', 'matrices', 'meet-in-the-middle',
       'number theory', 'probabilities', 'schedules', 'shortest paths',
       'sortings', 'string suffix structures', 'strings', 'ternary search',
       'trees', 'two pointers', 'submission_count', 'problem_solved',
       'contribution', 'country', 'follower_count', 'last_online_time_seconds',
       'max_rating', 'rating', 'rank', 'registration_time_seconds',
       'points_final'],
      dtype='object')

In [33]:
cf.feature_importances_

array([7.01937749e-02, 2.84784349e-03, 3.80718401e-04, 9.04929182e-03,
       1.61100054e-03, 1.56512257e-02, 1.11433409e-05, 1.92258474e-03,
       9.39642017e-03, 9.31235976e-03, 5.67372069e-03, 8.04612287e-04,
       1.01293908e-02, 2.24229545e-03, 2.86696234e-04, 1.28557749e-04,
       2.59054891e-04, 1.69685721e-03, 3.14610435e-03, 3.83875418e-04,
       3.62723138e-03, 1.49674582e-02, 2.11626409e-03, 1.98186318e-02,
       1.54045409e-02, 4.47167019e-04, 2.90599677e-04, 7.55252706e-03,
       3.45282372e-03, 3.39761005e-05, 1.96462293e-03, 8.56015481e-03,
       4.43075502e-04, 7.57509216e-03, 6.65061218e-04, 4.55360423e-03,
       3.37817763e-03, 7.15990281e-02, 7.21494935e-02, 4.44436647e-02,
       5.31765574e-02, 7.37178922e-02, 8.41455582e-02, 8.52382743e-02,
       8.67954220e-02, 3.42366903e-02, 8.52028894e-02, 6.93159936e-02])

In [34]:
confusion_matrix(cf.predict(X_resampled),y_resampled)

array([[59931, 10028,  4233,  1630,   650,   669],
       [ 1656, 21849,  1416,   478,   218,   210],
       [   29,    32,  2904,    16,     9,    12],
       [   40,    39,    66,  8841,    10,    14],
       [   38,    32,    43,    20, 10704,    18],
       [   75,    79,    73,    28,    15, 10692]])

In [35]:
cf.score(X_resampled,y_resampled)

0.8400842123730784

In [36]:
cf.score(X1_test,y_test)

0.5219575016097875

In [28]:
confusion_matrix(cf.predict(X1_test),y_test)

array([[8544, 3812,  966,  328,  140,  139],
       [3021, 2257,  758,  260,  129,  131],
       [ 582,  569,  219,   83,   60,   69],
       [ 192,  191,   72,   48,   27,   35],
       [  77,   89,   45,   18,   11,   15],
       [ 126,  114,   62,   36,   23,   47]])

In [38]:
f1_score(y_test,cf.predict(X1_test),average='weighted')

0.4593393098219673

In [39]:
test_X1 = std.transform(test_X)

In [40]:
test['pred'] = pd.Series(cf.predict(test_X1))

In [41]:
test.shape

(66555, 4)

In [42]:
test['pred'].value_counts()

1    51034
2    13093
6      888
4      642
5      573
3      325
Name: pred, dtype: int64

In [43]:
#test.to_csv('AV_pred.csv')