Building the dataset of numerical data

In [None]:
### PUT MAIN HERE ###

In [1]:
# Machine Learning Challenge
# Course: Machine Learning (880083-M-6)
# Group 58
 
##########################################
#             Import packages            #
##########################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import yake  #NOTE: with Anaconda: conda install -c conda-forge yake

##########################################
#      Import self-made functions        #
##########################################
from CODE.data_preprocessing.split_val import split_val
from CODE.data_preprocessing.find_outliers_tukey import find_outliers_tukey

#feature based on the title of the paper
from CODE.features.length_title import length_title

# features based on 'field_of_study' column 
from CODE.features.field_variety import field_variety         
from CODE.features.field_popularity import field_popularity
from CODE.features.field_citations_avarage import field_citations_avarage 

# features based on the topics of the paper
from CODE.features.topic_citations_avarage import topic_citations_avarage
from CODE.features.topic_variety import topics_variety
from CODE.features.topic_popularity import topic_popularity
from CODE.features.topic_citations_avarage import topic_citations_avarage

# features based on the abstract of the paper
from CODE.features.keywords import best_keywords
from CODE.features.abst_words import abst_words
from CODE.features.abst_words import abst_count

# features based on the venue of the paper
from CODE.features.venue_popularity import venue_popularity
from CODE.features.venue_citations import venues_citations

from CODE.features.age import age

# features based on the authors of the paper
from CODE.features.author_h_index import author_h_index
from CODE.features.paper_h_index import paper_h_index
from CODE.features.team_size import team_size
from CODE.features.author_database import author_database


##########################################
#              Load datasets             #
##########################################
# Main datasets (paths are relative to the notebook's working directory)
data = pd.read_json('DATA/train.json')      # Training set
test = pd.read_json('DATA/test.json')       # Test set

# Author-centric datasets
#   Built once with our self-made function 'citations_per_author' (for the author_citation_dic).
#   Running it took a long time (ballpark ~10 minutes on a laptop in 'silent mode'), so the
#   results were saved to disk and are reloaded here instead of being recomputed.
#   NOTE: pickle.load executes whatever the file contains - only load pickles this project wrote.
#   NOTE: author_db is assigned again further down by author_database(data).
import pickle
with open('my_dataset1.pickle', 'rb') as dataset, open('my_dataset2.pickle', 'rb') as dataset2:
    author_citation_dic = pickle.load(dataset)
    author_db = pickle.load(dataset2)


##########################################
#        Missing values handling         #
##########################################

# Columns whose missing entries are replaced by an empty string.
# NOTE(review): 'references' is used as a numeric column of num_X later on; an empty
#   string would turn it into an object column if any value were actually missing - confirm.
empty_string_columns = (
    'fields_of_study',
    'title',
    'abstract',
    'authors',
    'venue',
    'references',
    'topics',
)
for column in empty_string_columns:
    data.loc[data[column].isnull(), column] = ""

# Not filled here:
#   'year'           -> filled with the mean year per venue after feature creation
#                       (if the venue is not known, something else is needed)
#   'is_open_access' -> filled with the most frequent value per venue after feature creation
#                       (if the venue is not known, something else is needed)
    
##########################################
#       Create basic numeric df          #
##########################################
# Seed the feature frame with the raw columns that are used as-is.
# 'doi' is only carried along as a row identifier and is dropped again before modelling.
end = len(data)
base_columns = ['doi', 'citations', 'year', 'references']
num_X = data.loc[0:end + 1, base_columns]


##########################################
#            Feature creation            #
##########################################
"""
FEATURE DATAFRAME: num_X

ALL: After writing a funtion to create a feature, please incorporate your new feature as a column on the dataframe below.
This is the dataframe we will use to train the models.

DO NOT change the order in this section if at all possible
"""
num_X['title_length'] = length_title(data)      # returns a numbered series
num_X['field_variety'] = field_variety(data)    # returns a numbered series 
num_X['field_popularity'] = field_popularity(data) # returns a numbered series
num_X['field_citations_avarage'] = field_citations_avarage(data) # returns a numbered series
num_X['team_sz'] = team_size(data)           # returns a numbered series
num_X['topic_var'] = topics_variety(data)    # returns a numbered series
num_X['topic_popularity'] = topic_popularity(data) # returns a numbered series
num_X['topic_citations_avarage'] = topic_citations_avarage(data) # returns a numbered series
num_X['venue_popularity'], num_X['venue'] = venue_popularity(data)  # returns a numbered series and a pandas.Series of the 'venues' column reformatted 

# Dummy-encode 'is_open_access' (True = 1).
# pd.get_dummies encodes a missing value as all zeros, which would silently turn
# "unknown" into "not open access" and leave nothing for the venue-based imputation
# to fill in. Rows whose source value is missing are therefore masked back to NaN.
open_access_dummy = pd.get_dummies(data["is_open_access"], drop_first = True).iloc[:, 0]
num_X['open_access'] = open_access_dummy.astype(float).where(data["is_open_access"].notnull())

num_X['age'] = age(data)               # returns a numbered series. Needs to be called upon AFTER the venues have been reformed (by venue_popularity)
num_X['venPresL'] = venues_citations(data)   # returns a numbered series. Needs to be called upon AFTER the venues have been reformed (by venue_popularity)
keywords = best_keywords(data, 1, 0.954, 0.955)    # from [data set] get [integer] keywords from papers btw [lower bound] and [upper bound] quantiles; returns list
num_X['has_keyword'] = abst_words(data, keywords)  # returns a numbered series: 1 if any of the words is present in the abstract, else 0
num_X['keyword_count'] = abst_count(data, keywords) # same as above, only a count (not bool)

# Author H-index
author_db, reformatted_authors = author_database(data)
data['authors'] = reformatted_authors
num_X['h_index'] = paper_h_index(data, author_citation_dic) # Returns a numbered series. Must come after author names have been reformatted.

# Mean citation count of all papers sharing the same 'field_variety' value.
# field_avg_cit is kept as the per-group lookup table; transform('mean') broadcasts
# the same group means back onto the rows in one step instead of one .loc write per group.
field_avg_cit = num_X.groupby('field_variety').citations.mean()
num_X['field_cit'] = num_X.groupby('field_variety').citations.transform('mean')


"""
END do not reorder
"""

##########################################
#    Deal with specific missing values   #
##########################################
# Impute the remaining gaps per venue, falling back to a dataset-wide value.
# Rows are addressed directly through their index label, so no 'doi' lookup is needed.

def _first_mode(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan

# Open_access: most frequent value of the paper's venue, else most frequent value overall.
# Mode per group, thanks to jreback (27th of July 2016) https://github.com/pandas-dev/pandas/issues/13809
# Overall mode, thanks to BENY (2nd of February, 2018) https://stackoverflow.com/questions/48590268/pandas-get-the-most-frequent-values-of-a-column
# Aggregating to ONE value per venue keeps the dict keyed by venue; venues without any
# known value are dropped so that they fall through to the overall mode.
OpAc_by_venue = num_X.groupby('venue').open_access.agg(_first_mode).dropna().to_dict()
OpAc_is_missing = num_X['open_access'].isnull()
missing_OpAc = num_X.loc[OpAc_is_missing]
if OpAc_is_missing.any():
    OpAc_overall = _first_mode(num_X['open_access'])
    num_X.loc[OpAc_is_missing, 'open_access'] = (
        missing_OpAc['venue'].map(OpAc_by_venue).fillna(OpAc_overall)
    )

# Year: mean publication year of the paper's venue, else mean year of the entire dataset.
# Venues whose years are all missing have no usable mean and are dropped from the lookup.
# NOTE(review): 'age' was computed before this step; if it is derived from 'year' it may
#   still be missing for the rows filled here - confirm.
year_by_venue = num_X.groupby('venue').year.mean().dropna().to_dict()
year_is_missing = num_X['year'].isnull()
missing_year = num_X.loc[year_is_missing]
if year_is_missing.any():
    num_X.loc[year_is_missing, 'year'] = (
        missing_year['venue'].map(year_by_venue).fillna(num_X['year'].mean())
    )

### Drop columns containing just strings
num_X = num_X.drop(['venue', 'doi', 'field_variety'], axis = 1)


##########################################
#    Outlier detection 1: threshold      #
##########################################
# 9658 rows in the full num_X
# 9494 rows with all turned on

# num_X = num_X[num_X['references'] < 500]
# num_X = num_X[num_X['team_sz'] < 40]
# num_X = num_X[num_X['topic_var'] < 60]
# num_X = num_X[num_X['venPresL'] < 300]
# num_X = num_X[num_X['h_index'] < 30]

#%store num_X

##########################################
#            Train/val split             #
##########################################
## Hold out a validation set; 'citations' is the target column
X_train, X_val, y_train, y_val = split_val(num_X, target_variable='citations')


##########################################
#     Outlier detection 2: Quantile      #
##########################################
### Rows flagged as outliers are removed from the TRAINING split only
### names: X_train, X_val, y_train, y_val

# print(list(X_train.columns))

# Element [0] of the find_outliers_tukey result is used as the row labels to drop.
out_y = find_outliers_tukey(x=y_train['citations'], top=93, bottom=0)[0]

# Optional: also flag rows by a feature column and combine both label lists
# out_X = (find_outliers_tukey(x = X_train['team_sz'], top = 99, bottom = 0))[0]
# out_rows = out_y + out_X

# De-duplicated, ordered labels; the same rows leave X and y so they stay aligned.
out_rows = sorted(set(out_y))
X_train = X_train.drop(labels=out_rows)
y_train = y_train.drop(labels=out_rows)

# Potential features to get rid of: team_sz; year and age are perfect correlates



In [None]:
"""
MOVE models here
"""

In [11]:
##########################################
#         Model implementations          #
##########################################
from CODE.models.regression import simple_linear
from CODE.models.regression import log_reg
from CODE.models.regression import sdg_reg
from CODE.models.regression import poly_reg
from CODE.models.regression import pois_reg
from CODE.models.non_linear import de_tree_reg
from CODE.models.non_linear import kn_reg
from CODE.models.non_linear import my_svr
from CODE.models.non_linear import mlp_reg
"""
IMPLEMENT models here: to run a model, delete the # and run
NOTE: Please do not modify X_train, X_val, y_train, y_val in your model - make new variables if needed
"""

#-----------simple regression, all columns
#simple_linear(X_train, y_train, X_val, y_val)

"""
MODEL RESULTS:
R2: 0.03724  
MSE: 33.38996
# Worse after extra outlier removal (0.015478)
"""
#-----------logistic regression, all columns
#log_reg(X_train, y_train, X_val, y_val)

"""
MODEL RESULTS:
R2: 0.006551953988217396
MSE: 34.07342328208346
# Worse after extra outlier removal (0.003)
"""
#-----------SGD regression, all columns
#sdg_reg (X_train, y_train, X_val, y_val)

"""
lr = [ 1, .1, .01, .001, .0001]
learning_rate in ['constant', 'optimal', 'invscaling']:
loss in ['squared_error', 'huber']:

# MODEL RESULTS:
# Best outcome, before extra outlier removal: ('constant', 0.01, 'squared_error', 35.74249957361433, 0.04476790061780822)
# Best outcome after extra outlier removal: ('constant', 0.01, 'squared_error', 37.08290449479669, 0.019303736163186702)
"""

#-----------polynomial regression, all columns
#poly_reg (X_train, y_train, X_val, y_val, 3)

"""
MODEL RESULTS:
r2: -0.05109 (degree = 2)
r2: -0.0378 (degree = 3)
r2: -5.5816 (degree = 4)
MAE 35.1660
"""

#-----------poisson regression, all columns
#pois_reg (X_train, y_train, X_val, y_val)

"""
MODEL RESULTS:
r2: 0.022145
MAE: 39.21127
"""

#-----------simple linear regression, dropping columns

"""
USE this code to run one of the simple regression models, successively dropping one column
To run, unhash the full function, then unhash the specific model
For a baseline, run the corresponding model above
"""
# summaries = list(X_train.columns)
# print(summaries)

# for i in range(len(summaries)):
#     X_train_small = X_train.copy()
#     X_val_small = X_val.copy()
#     drops = summaries[i]
#     X_train_small.drop(drops, inplace = True, axis=1)
#     X_val_small.drop(drops, inplace = True, axis=1)

#     print("dropped:", summaries[i])
    
#     #simple_linear(X_train_small, y_train, X_val_small, y_val)  #dropping venue_popularity helps a tiny bit
#     #log_reg(X_train_small, y_train, X_val_small, y_val)


#----------- Random Forest for Regression
#de_tree_reg (X_train, y_train, X_val, y_val, 50)

"""
MODEL RESULTS:
r2: 0.006518029337933218  depth = 2
r2: 0.010480933407271853  depth = 3
r2: 0.013140361155744351  depth = 4
r2: 0.02475733890010956   depth = 10
r2: 0.027754095018432956  depth = 20
r2: 0.028205843489561455  depth = 30
r2: 0.02787632669251372  depth = 50
"""

#----------- K-Neighbors for Regression
#kn_reg (X_train, y_train, X_val, y_val, neighbors = 20, algorithm = 'auto', leaf_sz = 30)
"""
OPTIONS:
algorithm = 'auto', 'ball_tree', 'kd_tree', 'brute'
DEFAULT values: neighbors = 5, algorithm = 'auto', leaf_sz = 30

MODEL RESULTS:
r2: 0.0020787461461421186  neighbors = 2
r2: 0.0036641038448516072  neighbors = 3
r2: 0.012151620462786172   neighbors = 10
r2: 0.012527572947568677   neighbors = 20
"""

#-----------  Multi-layer Perceptron for Regression
# Currently the only model call that is switched on in this cell.
mlp_reg(
    X_train, y_train, X_val, y_val,
    maxit=500,
    activation='relu',
    solver='adam',
    alpha=0.0001,
    lr='constant',
)
"""
OPTIONS:
activation= 'identity', 'logistic', 'tanh', 'relu'
solver= 'lbfgs', 'sgd', 'adam'
lr= 'constant', 'invscaling', 'adaptive'
DEFAULT values: maxit=500, activation='relu', solver='adam', alpha=0.0001, lr='constant'

MODEL RESULTS:
r2: 0.005729150866153665
score: 0.005729150866153665
"""


#----------- Odds and Ends
#model.fit(X_train, y_train)
#print('Best score: ', model.best_score_)
#print('Best parameters: ', model.best_params_)
#y_pred = model.predict(X_val)

#from sklearn.metrics import r2_score
#print(r2_score(y_val,y_pred))


# import json
#with open("sample.json", "w") as outfile:
    #json.dump(dictionary, outfile)


  y = column_or_1d(y, warn=True)


r2: 0.005729150866153665
score: 0.005729150866153665



"\nOPTIONS:\nactivation= 'identity', 'logistic', 'tanh', 'relu'\nsolver= 'lbfgs', 'sgd', 'adam'\nlr= 'constant', 'invscaling', 'adaptive'\nDEFAULT values: maxit=500, activation='relu', solver='adam', alpha=0.0001, lr='constant'\n\nMODEL RESULTS:\nr2: 0.005729150866153665\nscore: 0.005729150866153665\n"

In [None]:
"""
-----------------------------------------------------------------------------------------------------------
------------------------------ EXPLORE and VISUALIZE ------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
"""
"""
"""

In [None]:
#### STOP - run ONLY if needed
# Removes the column-width limit so long text cells are printed in full
import pandas as pd
pd.options.display.max_colwidth = None

In [4]:
### FOR: exploring the new dataframe with numerical columns
# Bare last expression: the notebook renders the frame itself (long frames are shown truncated).
num_X

Unnamed: 0,citations,year,references,title_length,field_popularity,field_citations_avarage,team_sz,topic_var,topic_popularity,topic_citations_avarage,venue_popularity,open_access,age,venPresL,has_keyword,keyword_count,h_index,field_cit
0,60,2015.0,39,10.0,9394,37.902597,6,1,75,45.186667,2005,1,6.0,70.793257,1,16,16.0,36.829665
1,1,2020.0,44,18.0,9394,37.902597,5,0,10,4.134021,8,1,1.0,3.75,1,25,1.0,36.829665
2,5,2017.0,30,8.0,9394,37.902597,3,5,344,43.519941,116,1,4.0,12.702479,1,19,6.0,36.829665
3,5,2017.0,11,13.0,9394,37.902597,2,6,1019,57.829761,68,1,4.0,10.797101,1,14,5.0,36.829665
4,10,2015.0,26,5.0,9394,37.902597,2,23,1131,60.718362,30,1,6.0,4.064516,1,46,3.0,36.829665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9653,8,2014.0,25,10.0,9394,37.902597,4,11,230,57.869379,9,1,7.0,3.444444,1,14,5.0,36.829665
9654,1,2019.0,18,7.0,9394,37.902597,4,3,186,50.620347,462,1,2.0,9.042857,1,21,4.0,36.829665
9655,1,2021.0,12,16.0,9394,37.902597,2,0,10,4.134021,12,0,0.0,1.166667,1,12,1.0,36.829665
9656,3,2021.0,15,15.0,586,2.764706,4,0,10,4.134021,9,0,0.0,1.777778,1,17,2.0,2.764706


In [None]:
"""
Look at some correlations - full num_X
"""
# names: X_train, X_val, y_train, y_val

# Pearson correlation heatmap of every column in num_X (target included).
# Adapted from: https://www.kaggle.com/ankitjha/comparing-regression-models
import seaborn as sns

heatmap_style = {'vmax': 1, 'square': True, 'annot': True, 'cmap': 'cubehelix'}
corr_mat = num_X.corr(method='pearson')
plt.figure(figsize=(20, 10))
sns.heatmap(corr_mat, **heatmap_style)


In [None]:
"""
Look at some correlations - X_train
NOTE: there is no y here
"""
# names: X_train, X_val, y_train, y_val

#temp = y_train hstack X_train


# Pearson correlation heatmap of the training features only (no target column).
# Adapted from: https://www.kaggle.com/ankitjha/comparing-regression-models
train_heatmap_style = {'vmax': 1, 'square': True, 'annot': True, 'cmap': 'cubehelix'}
corr_mat = X_train.corr(method='pearson')
plt.figure(figsize=(20, 10))
sns.heatmap(corr_mat, **train_heatmap_style)

In [None]:
"""
Plots of each column against y
"""
# Keep 'plt' bound to pyplot: binding the top-level matplotlib package to this alias
# would break every later plt.figure(...) call in the notebook.
import matplotlib.pyplot as plt

# Feature columns of num_X plotted against the target. Trailing notes are the
# observations made while eyeballing each plot.
scatter_columns = [
    "year",
    "references",               # might have 3 outliers
    "title_length",             # anything over 30 as outlier?
    "team_sz",                  # might have 3 outliers
    "topic_var",                # one outlier; maybe anything over 40
    "topic_popularity",
    "topic_citations_avarage",  # spelled exactly as the column is created in num_X
    "venue_popularity",
    "open_access",
    "age",
    "venPresL",                 # anything over 300 as outlier?
    "has_keyword",
    "keyword_count",
    "h_index",                  # anything over 35 as outlier?
    "field_cit",
]

# One scatter plot per feature, each in its own figure.
for column in scatter_columns:
    num_X.plot.scatter(x=column, y="citations", alpha=0.5)

In [None]:
### FOR: explore keyword generation
# names: X_train, X_val, y_train, y_val
# Label/value pairs are printed one per line, followed by a blank line and the keyword list.
keyword_report = [
    ("number of keywords:", len(keywords)),
    ("total train rows:", X_train.shape),
    ("numer w keyword:", sum(X_train['has_keyword'])),
]
for label, value in keyword_report:
    print(label, value)
print()
print(keywords)

# Results are pretty useless as is
#6210 of 6313
#6136 (of 6313) for 1 keyword from the top 1% of papers
#4787 for 2 keywords from top .01% of papers (correlation: 0.036)
#2917 for 1 keyword from top .01% of papers (correlation: 0.049)

In [None]:
"""
-----------------------------------------------------------------------------------------------------------
------------------------- TEST CODE -----------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
"""
"""
"""

In [None]:
"""
create a play copy of the training data
"""
# Independent copy to experiment on; X_train itself stays untouched.
play = X_train.copy()
print(list(X_train.columns))
# Both shapes are printed so the copy can be checked against the original.
for frame in (X_train, play):
    print(frame.shape)

In [None]:
"""
Create a random mini version of the main 'data' dataframe
"""
import pandas as pd
import numpy as np
# 100 distinct rows of 'data'; the fixed random_state makes the draw repeatable.
mini = data.sample(n=100, replace=False, axis=0, random_state=123)
print(mini.shape)
print(mini.columns.tolist())

In [None]:
"""
Choose your columns
"""

# Alternative: keep a contiguous range of columns instead of dropping named ones
#X_train_small = X_train.loc[ : , 'topic_var':'h_index'].copy()
#X_val_small = X_val.loc[ : , 'topic_var':'h_index'].copy()

# The same columns are removed from both splits; X_train and X_val are left unchanged.
drops = ['year', 'team_sz', 'has_keyword']
X_train_small = X_train.drop(columns=drops)
X_val_small = X_val.drop(columns=drops)

In [None]:
"""
This doesn't work: there are a different number of words in each list.
Maybe just take the top... 20?
"""

def abst_categories (the_data, keywords, mid_keywords, low_keywords):
    """Assign every abstract to the keyword tier it matches best.

    For each abstract the share of each keyword list that occurs in the
    lower-cased text is computed (hits / list length). Using a share instead of
    a raw count keeps lists of different lengths comparable.

    Parameters
    ----------
    the_data : pandas.DataFrame
        Must contain an 'abstract' column; entries that are not strings
        (e.g. None / NaN) are treated as missing.
    keywords, mid_keywords, low_keywords : sequence of str
        Keyword lists for the high, middle and low tier. Keywords are matched
        as substrings and are used as given (not lower-cased).

    Returns
    -------
    pandas.Series
        One integer per row of ``the_data``, carrying the same index:
        0 = abstract missing or no keyword matched, 1 = low tier,
        2 = middle tier, 3 = high tier. Ties go to the higher tier.
    """
    def _hit_share(text, words):
        # Fraction of `words` found in `text`; an empty list can never match.
        words = list(words)
        if not words:
            return 0.0
        return sum(1 for word in words if word in text) / len(words)

    abst_key = []
    for abstract in the_data['abstract']:
        if not isinstance(abstract, str):
            abst_key.append(0)
            continue

        text = abstract.lower()
        tiers = (
            (3, _hit_share(text, keywords)),
            (2, _hit_share(text, mid_keywords)),
            (1, _hit_share(text, low_keywords)),
        )
        # Highest share wins; the tier number breaks ties in favour of the higher tier.
        category, share = max(tiers, key=lambda tier: (tier[1], tier[0]))
        abst_key.append(category if share > 0 else 0)

    return pd.Series(abst_key, index=the_data.index)


In [2]:
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error

# Standardise the features: the scaler is fitted on the training split only and
# then applied unchanged to the validation split.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)

# scikit-learn expects a 1-D target; y_train is a single-column frame, so it is
# flattened here instead of relying on the implicit conversion (and its warning).
model = PoissonRegressor()
reg = model.fit(X = X_train_s, y = np.ravel(y_train))
y_pred_val = reg.predict(X_val_s)

# NOTE(review): the recorded run of this cell gave a hugely negative r2. Outliers are
#   only removed from the training split, so extreme validation rows may be blowing up
#   the predictions - confirm before trusting either number.
print('r2:', r2_score(y_val, y_pred_val))
print("MAE:", mean_absolute_error(y_val, y_pred_val))

"""
MODEL RESULTS:
r2: 0.022145
MAE: 39.21127
"""


r2: -22351073067.236362
MAE: 554544.8671483811


  y = column_or_1d(y, warn=True)


'\nMODEL RESULTS:\nr2: 0.022145\nMAE: 39.21127\n'

In [None]:
"""
sklearn:
SVR
KNeighborsRegressor()
DecisionTreeRegression()
"""

"""
from sklearn.svm import SVC  - classifier
#... load the data into X,y
model = SVC(kernel='poly')
model.fit(X,y)
"""


