Building the dataset of numerical data

In [None]:
### PUT MAIN HERE ###

In [7]:
"""
SETUP
"""
### Import models
import pandas as pd
import numpy as np

### Import self-made functions
from CODE.data_preprocessing.split_val import split_val
from CODE.features.length_title import length_title
from CODE.features.field_variety import field_variety2
#from CODE.features.field_variety import field_variety
from CODE.features.team_size import team_size
from CODE.features.topic_variety import topics_variety
from CODE.features.venue_frequency import venue_frequency
from CODE.features.age import age

from CODE.features.abst_words import abst_words




### Get the full train set:
# One row per paper, read from the project-relative training file.
data = pd.read_json('DATA/train-1.json')   # Numerical columns: 'year', 'references', 'citations'

### push the numerical columns to num_X
# `end` is kept because it is a cell-level name other cells may still read.
end = len(data)
# Select the columns with a list (the unambiguous multi-column selector) and take
# an explicit copy: feature columns are added to num_X afterwards, and writing
# into a selection of `data` can trigger pandas' SettingWithCopyWarning.
# All rows are kept, exactly as the former `0:end+1` label slice did.
num_X = data[['doi', 'citations', 'year', 'references']].copy()


"""
FEATURE DATAFRAME: num_X

ALL: After writing a function to create a feature, please incorporate your new feature as a column in the dataframe below.
This is the dataframe we will use to train the models.
"""

### use feature function to create a new variable
title_len = length_title(data)      # returns: dictionary of lists: [doi](count)
field_var = field_variety2(data)    # returns: dictionary of lists: [doi](count)
team_sz = team_size(data)           # returns a numbered series
topic_var = topics_variety(data)    # returns a numbered series
venue_freq = venue_frequency(data)  # returns a dictionary: [venue](count)
paper_age = age(data)               # returns a numbered series

# get_dummies returns a DataFrame (and bool dtype on recent pandas). Keep the
# single column that survives drop_first and cast it so the feature is a plain
# 0/1 integer Series (True = 1) regardless of the pandas version.
open_access = (
    pd.get_dummies(data["is_open_access"], drop_first=True)
    .iloc[:, 0]
    .astype(int)
)

keywords = ["method", "review", "randomized", "random control", "cancer"]
abst_keywords = abst_words(data, keywords)   # returns a numbered series: 1 if any of the words is present in the abstract, else 0

# TODO(review): venue_freq is computed above but never added to num_X.
# It is keyed by venue, so it needs a venue column to map through -- confirm
# the column name before wiring it in.

### join the variables to num_X
# All new columns are added in a single assign() call instead of growing the
# frame one column at a time. Series are aligned on the index; the
# doi-keyed dictionaries are looked up through the 'doi' column.
num_X = num_X.assign(
    team_size=team_sz,
    topic_variety=topic_var,
    age=paper_age,
    open_access=open_access,
    has_keyword=abst_keywords,
    title_length=num_X['doi'].map(title_len),
    field_variety=num_X['doi'].map(field_var),
)

### train/val split
X_train, X_val, y_train, y_val = split_val(num_X, target_variable = 'citations')




"""
IMPLEMENT model fuctions here
"""

'\nIMPLEMENT model fuctions here\n'

In [8]:
# Sanity check of the keyword feature: row count of the raw data, the
# container type returned by abst_words, and the values themselves.
print(len(data), type(abst_keywords), abst_keywords, sep="\n")
num_X

9658
<class 'pandas.core.series.Series'>
0       1
1       0
2       0
3       0
4       0
       ..
9653    1
9654    1
9655    0
9656    0
9657    0
Length: 9658, dtype: int64


Unnamed: 0,doi,citations,year,references,team_size,topic_variety,age,open_access,has_keyword,title_length,field_variety
0,10.3115/v1/P15-1039,60,2015.0,39,6,1,6.0,1,1,10,1
1,10.18653/v1/2020.eval4nlp-1.12,1,2020.0,44,5,0,1.0,1,0,18,1
2,10.18653/v1/W17-3516,5,2017.0,30,3,5,4.0,1,0,8,1
3,10.18653/v1/S17-2160,5,2017.0,11,2,6,4.0,1,0,13,1
4,10.18653/v1/W15-2205,10,2015.0,26,2,23,6.0,1,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...
9653,10.3115/v1/W14-0202,8,2014.0,25,4,11,7.0,1,1,10,1
9654,10.26615/978-954-452-058-8_001,1,2019.0,18,4,3,2.0,1,1,7,1
9655,10.18653/V1/2021.SMM4H-1.16,1,2021.0,12,2,0,0.0,0,0,16,1
9656,10.18653/v1/2021.case-1.22,3,2021.0,15,4,0,0.0,0,0,15,4


In [None]:
### FOR: exploring the scaffolding of the new dataframe for prediction as pulled from the full dataset

# print(type(data))
# print(list(data.columns))
# Report the container type and dimensions of the feature frame, then show the raw data.
print(f"X type: {type(num_X)} X shape: {num_X.shape}")
data

In [None]:
### FOR: exploring the results of feature functions

# Print the container type of every feature object, in the order they were built.
for feature in (title_len, field_var, team_sz, topic_var, venue_freq, paper_age):
    print(type(feature))
#title_len
#field_var
#team_sz
#topic_var
#venue_freq

In [6]:
### FOR: exploring the new dataframe with numerical columns

# from StackExchange:
# Never grow a DataFrame! When appending to a df, a new DataFrame is created each
# time in memory instead of using the existing one, which is quite frankly a
# waste. It is always cheaper to append to a python list and then convert it to
# a DataFrame at the end, both in terms of memory and performance.

# --> NOTE: it would be more efficient to combine these first and only expand the df once (per addition type)

# Bare last expression: renders the feature frame as the cell output.
num_X


Unnamed: 0,doi,citations,year,references,team_size,topic_variety,age,open_access,has_keyword,title_length,field_variety
0,10.3115/v1/P15-1039,60,2015.0,39,6,1,6.0,1,1,10,1
1,10.18653/v1/2020.eval4nlp-1.12,1,2020.0,44,5,0,1.0,1,0,18,1
2,10.18653/v1/W17-3516,5,2017.0,30,3,5,4.0,1,0,8,1
3,10.18653/v1/S17-2160,5,2017.0,11,2,6,4.0,1,0,13,1
4,10.18653/v1/W15-2205,10,2015.0,26,2,23,6.0,1,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...
9653,10.3115/v1/W14-0202,8,2014.0,25,4,11,7.0,1,1,10,1
9654,10.26615/978-954-452-058-8_001,1,2019.0,18,4,3,2.0,1,1,7,1
9655,10.18653/V1/2021.SMM4H-1.16,1,2021.0,12,2,0,0.0,0,0,16,1
9656,10.18653/v1/2021.case-1.22,3,2021.0,15,4,0,0.0,0,0,15,4


In [None]:
### FOR: explore data train/val split  (should be 6470 train rows and 3188 validation rows)

#X_train
#X_val
#y_train
#y_val


In [None]:
# Basic regression model Using any continuous variables
#     Establish data
#     Define model: regression model: sklearn.linear_model.LinearRegression
#     Fit model
#     Predict
#     Evaluate

from sklearn.linear_model import LinearRegression
# The metric functions were used below without ever being imported, which
# raised NameError on a fresh kernel.
from sklearn.metrics import mean_absolute_error, r2_score

model = LinearRegression()

# NOTE(review): num_X carries the string column 'doi'. If split_val leaves it
# in X_train / X_val, fit() cannot convert it to float -- confirm it is dropped.

# 1. z-score
# TODO: scaling is not implemented yet; the features are passed in unscaled.
reg = model.fit(X = X_train, y = y_train)  # 2. fit model
print("Model weights:", reg.coef_)
print("Model intercept/bias:", reg.intercept_)
y_pred_val = model.predict(X_val)  # 3. predict
a = r2_score(y_val, y_pred_val)  # 4. evaluate
b = mean_absolute_error(y_val, y_pred_val)
# Show the validation metrics; previously they were computed and then discarded.
print("Validation R^2:", a)
print("Validation MAE:", b)


In [None]:
import pandas as pd
import numpy as np
# Scratch cell: load a ten-row sample of the training file to prototype the
# keyword feature without touching the full dataset.
# NOTE(review): the %cd target below is an absolute path on one developer's
# machine, so this cell will not run elsewhere; the relative 'DATA/...' path
# depends on that working directory.
%pwd
%cd C:\Users\r_noc\Desktop\GIT\machinelearning
    
play = pd.read_json('DATA/train-1.json')   # Numerical columns: 'year', 'references', 'citations'
# Keep only the first ten rows (positional slice).
play = play.iloc[0:10]
print(play.shape)
# print(play['abstract'])

# List the available column names of the sample.
print(list(play.columns))
# play['has_keyword'] = np.nan
# print(play.shape)
# play

In [None]:
def abst_words(the_data, keywords):
    """Flag rows whose abstract contains at least one of the given keywords.

    Matching is a plain, case-sensitive substring test (``word in abstract``).

    Parameters
    ----------
    the_data : pandas.DataFrame
        Must provide an ``'abstract'`` column.
    keywords : iterable of str
        Substrings to look for in each abstract.

    Returns
    -------
    pandas.Series
        Integer (int64) series with a fresh 0..n-1 index, one entry per row of
        ``the_data`` in order: 1 if any keyword occurs in the abstract, else 0.
        Missing or non-text abstracts (``None``, NaN, ...) yield 0.
    """
    abst_key = [
        # Only real strings can be searched. A NaN abstract is a float and
        # would raise TypeError on the substring test, so it is treated like a
        # missing abstract. any() stops at the first keyword that matches.
        int(isinstance(text, str) and any(word in text for word in keywords))
        for text in the_data['abstract']
    ]
    # An explicit dtype keeps the result integer even when there are no rows.
    return pd.Series(abst_key, dtype="int64")

# Substrings to search for in each abstract.
keywords = ["method", "review", "randomized", "random control", "cancer"]
# Try the prototype on the ten-row sample; as the cell's last expression the
# returned Series is displayed.
abst_words(play, keywords)
