Building the dataset of numerical data

In [None]:
### PUT MAIN HERE ###

In [17]:
"""
SETUP
"""
### Import modules
import pandas as pd
import numpy as np

### Import self-made functions
from CODE.data_preprocessing.split_val import split_val
from CODE.features.length_title import length_title
from CODE.features.field_variety import field_variety2
#from CODE.features.field_variety import field_variety
from CODE.features.team_size import team_size
from CODE.features.topic_variety import topics_variety
from CODE.features.venue_frequency import venue_frequency
from CODE.features.age import age

### Get the full train set:
data = pd.read_json('DATA/train-1.json')   # Numerical columns: 'year', 'references', 'citations'

### Push the identifier, target and numerical columns to num_X.
### .copy() makes num_X an independent frame, so adding feature columns to it
### later does not trigger pandas' SettingWithCopyWarning.
end = len(data)   # row count; not needed for the selection below, kept for cells that may still use it
base_columns = ['doi', 'citations', 'year', 'references']
num_X = data.loc[:, base_columns].copy()


# FEATURE DATAFRAME: num_X
#
# ALL: After writing a function to create a feature, please incorporate your
# new feature as a column on the dataframe below.
# This is the dataframe we will use to train the models.

### use feature function to create a new variable
title_len = length_title(data)      # returns: dictionary of lists: [doi](count)
field_var = field_variety2(data)    # returns: dictionary of lists: [doi](count)
team_sz = team_size(data)           # returns a numbered series
topic_var = topics_variety(data)    # returns a numbered series
venue_freq = venue_frequency(data)  # returns a dictionary: [venue](count); not joined to num_X yet
paper_age = age(data)               # returns a numbered series

### join all variables to num_X in one step (the frame is expanded only once)
num_X = num_X.assign(
    # variables of type series are aligned on the row index
    team_size=team_sz,
    topic_variety=topic_var,
    age=paper_age,
    # variables of type dictionary are looked up through the 'doi' column
    title_length=num_X['doi'].map(title_len),
    field_variety=num_X['doi'].map(field_var),
)

### train/val split
X_train, X_val, y_train, y_val = split_val(num_X, target_variable = 'citations')


# TODO: INSERT split X and y on the train here


# Placeholder kept as the cell's final expression (it is what the cell echoes as output).
"""
IMPLEMENT model fuctions here
"""

'\nIMPLEMENT model fuctions here\n'

In [6]:
### FOR: inspecting the layout of the prediction dataframe that was pulled from the full dataset

# Extra checks that can be switched on while exploring:
# print(type(data))
# print(list(data.columns))

# Summarise the feature frame first, then show the raw dataset as the cell output.
frame_kind = type(num_X)
frame_dims = num_X.shape
print("X type:", frame_kind, "X shape:", frame_dims)
data

X type: <class 'pandas.core.frame.DataFrame'> X shape: (9658, 9)


Unnamed: 0,doi,title,abstract,authors,venue,year,references,topics,is_open_access,fields_of_study,citations
0,10.3115/v1/P15-1039,Generating High Quality Proposition Banks for ...,Semantic role labeling (SRL) is crucial to nat...,"[A. Akbik, Laura Chiticariu, Marina Danilevsky...",ACL,2015.0,39,[Semantic role labeling],True,[Computer Science],60
1,10.18653/v1/2020.eval4nlp-1.12,One of these words is not like the other: a re...,Word embeddings are an active topic in the NLP...,"[Jesper Brink Andersen, Mikkel Bak Bertelsen, ...",EVAL4NLP,2020.0,44,[],True,[Computer Science],1
2,10.18653/v1/W17-3516,The Code2Text Challenge: Text Generation in So...,We propose a new shared task for tactical data...,"[Kyle Richardson, Sina Zarrieß, Jonas Kuhn]",INLG,2017.0,30,"[Natural language generation, Library (computi...",True,[Computer Science],5
3,10.18653/v1/S17-2160,The Meaning Factory at SemEval-2017 Task 9: Pr...,We evaluate a semantic parser based on a chara...,"[Rik van Noord, Johan Bos]",SemEval@ACL,2017.0,11,"[Parsing, Convolutional neural network, Text-b...",True,[Computer Science],5
4,10.18653/v1/W15-2205,Semantic Parsing for Textual Entailment,In this paper we gauge the utility of general-...,"[Elisabeth Lien, Milen Kouylekov]",IWPT,2015.0,26,"[Textual entailment, Parsing, SemEval, Semanti...",True,[Computer Science],10
...,...,...,...,...,...,...,...,...,...,...,...
9653,10.3115/v1/W14-0202,IBM’s Belief Tracker: Results On Dialog State ...,Accurate dialog state tracking is crucial for ...,"[Rudolf Kadlec, Jindřich Libovický, Jan Macek,...",DM@EACL,2014.0,25,"[Dialog system, Discriminative model, Inclusio...",True,[Computer Science],8
9654,10.26615/978-954-452-058-8_001,RANLP 2019 Multilingual Headline Generation Ta...,The objective of the 2019 RANLP Multilingual H...,"[Marina Litvak, John M. Conroy, Peter A. Ranke...",,2019.0,18,"[Wikipedia, Heuristic, Information]",True,[Computer Science],1
9655,10.18653/V1/2021.SMM4H-1.16,A Joint Training Approach to Tweet Classificat...,In this work we describe our submissions to th...,"[Mohab Elkaref, L. Hassan]",SMM4H,2021.0,12,[],False,[Computer Science],1
9656,10.18653/v1/2021.case-1.22,Team “DaDeFrNi” at CASE 2021 Task 1: Document ...,This paper accompanies our top-performing subm...,"[Francesco Re, D. Végh, Dennis Atzenhofer, Nik...",CASE,2021.0,15,[],False,,3


In [8]:
### FOR: exploring the results of feature functions

# Print the container type of every feature result, in the order they were created.
for feature_result in (title_len, field_var, team_sz, topic_var, venue_freq, paper_age):
    print(type(feature_result))

# Uncomment one of these to look at the values themselves:
#title_len
#field_var
#team_sz
#topic_var
#venue_freq

<class 'dict'>
<class 'dict'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'dict'>
<class 'pandas.core.series.Series'>


In [9]:
### FOR: exploring the new dataframe with numerical columns

# from StackExchange:
# Never grow a DataFrame! When appending to a df, a new DataFrame is created
# each time in memory instead of reusing the existing one. It is always cheaper,
# in both memory and performance, to append to a python list and convert it to
# a DataFrame once at the end.

# --> NOTE: it would be more efficient to combine the new features first and only
#           expand the df once (per addition type)

# Bare expression: the cell displays the feature dataframe as its output.
num_X


Unnamed: 0,doi,citations,year,references,team_size,topic_variety,age,title_length,field_variety
0,10.3115/v1/P15-1039,60,2015.0,39,6,1,6.0,10,1
1,10.18653/v1/2020.eval4nlp-1.12,1,2020.0,44,5,0,1.0,18,1
2,10.18653/v1/W17-3516,5,2017.0,30,3,5,4.0,8,1
3,10.18653/v1/S17-2160,5,2017.0,11,2,6,4.0,13,1
4,10.18653/v1/W15-2205,10,2015.0,26,2,23,6.0,5,1
...,...,...,...,...,...,...,...,...,...
9653,10.3115/v1/W14-0202,8,2014.0,25,4,11,7.0,10,1
9654,10.26615/978-954-452-058-8_001,1,2019.0,18,4,3,2.0,7,1
9655,10.18653/V1/2021.SMM4H-1.16,1,2021.0,12,2,0,0.0,16,1
9656,10.18653/v1/2021.case-1.22,3,2021.0,15,4,0,0.0,15,4


In [11]:

# Placeholder cell: the string below is a bare expression, so running the cell only echoes it.
# NOTE(review): split_val in the setup cell already returns X_train/X_val and y_train/y_val
# separately -- confirm whether this extra X/y split is still needed.
"""
INSERT split X and y on the X_train
"""


# Earlier column selection, kept commented out for reference:
# num_X = data.loc[ 0:end+1 , ('doi', 'citations', 'year', 'references') ]


'\nINSERT split X and y on the train\n'

In [18]:
### FOR: explore data train/val split  (should be 6470 train rows and 3188 validation rows)

# Only the last expression of a cell is displayed: X_train is shown as-is;
# uncomment one of the alternatives below to display that split instead.
X_train
#X_val
# y_train
#y_val


Unnamed: 0,doi,year,references,team_size,topic_variety,age,title_length,field_variety
9263,10.3115/980491.980565,1984.0,6,1,12,37.0,4,1
7664,10.18653/v1/w19-3819,2019.0,19,1,6,2.0,11,2
9483,10.18653/v1/D15-1219,2015.0,38,1,7,6.0,7,1
378,10.18653/v1/2021.findings-acl.225,2021.0,27,2,0,0.0,9,1
3106,10.18653/v1/2020.nlpmc-1.6,2020.0,29,7,0,1.0,13,1
...,...,...,...,...,...,...,...,...
5734,10.18653/V1/2020.COLING-MAIN.541,2020.0,38,6,0,1.0,11,1
5191,10.18653/v1/P17-1152,2017.0,40,6,5,4.0,6,1
5390,10.1162/COLI_r_00106,2012.0,0,1,24,9.0,2,4
860,10.3115/980491.980599,1984.0,25,1,2,37.0,10,1


In [None]:
# Basic regression model using any continuous variables
#     1. Establish data
#     2. Define and fit model: sklearn.linear_model.LinearRegression
#     3. Predict on the validation set
#     4. Evaluate (R^2 and mean absolute error)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score  # were used below without being imported

# 1. establish data
# LinearRegression needs purely numeric input, but X_train / X_val still carry the
# string identifier column 'doi', so only the numeric feature columns are kept.
# X_val is indexed with the train columns so both sets share the same features and order.
# NOTE(review): 'year' and 'age' are floats, which may indicate missing values;
# LinearRegression rejects NaN -- confirm an earlier step handles them.
X_train_num = X_train.select_dtypes(include='number')
X_val_num = X_val[X_train_num.columns]

model = LinearRegression()

reg = model.fit(X = X_train_num, y = y_train)  # 2. fit model
print("Model weights:", reg.coef_)
print("Model intercept/bias:", reg.intercept_)
y_pred_val = model.predict(X_val_num)  # 3. predict
a = r2_score(y_val, y_pred_val)  # 4. evaluate: coefficient of determination
b = mean_absolute_error(y_val, y_pred_val)  # 4. evaluate: mean absolute error
print("Validation R^2:", a)
print("Validation MAE:", b)
