In [None]:
import pandas as pd
from src.constants import JOURNALS_DF
from sklearn.model_selection import train_test_split
# Load the journals DataFrame from the pickle file located at JOURNALS_DF.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle(JOURNALS_DF)

# A bare expression is rendered only when it is the last statement of a cell,
# so the summary statistics must be displayed explicitly here.
# `display` is provided by the IPython/Jupyter kernel.
display(df.describe())

# Features: each row's keyword entries joined into one space-separated string
# (assumes the column holds lists of strings -- TODO confirm).
X = df["dc:description:keywords"].str.join(" ")
# Target: the "citedby-count" column.
y = df["citedby-count"]

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Text-regression pipeline: TF-IDF features ("tf") feeding a random forest ("reg").
# The forest is seeded so that repeated fits of the same configuration give the
# same trees; without a seed, scores for identical settings vary run to run.
pipe = Pipeline(
    steps=[
        ("tf", TfidfVectorizer()),
        ("reg", RandomForestRegressor(random_state=42)),
    ]
)

In [None]:
from sklearn.metrics import get_scorer

# Scorer returning the negated mean squared error, so that "higher is better"
# as GridSearchCV expects.
scorer = get_scorer("neg_mean_squared_error")

# Hyper-parameter grid. Keys use the "<step>__<parameter>" form and address the
# "tf" (vectorizer) and "reg" (regressor) steps of `pipe`.
params = {
    "reg__max_depth": [4, 5],
    "reg__n_estimators": [100, 200],
    # "reg__learning_rate": [0.1, 0.01, 0.001],  # not a RandomForestRegressor parameter
    # Every value must be distinct: a repeated entry only re-fits the same
    # configurations without widening the search.
    "tf__max_features": [1000, 2000, 3000, 5000, 10000],
    "tf__stop_words": ["english"],
    "tf__ngram_range": [(1, 1), (1, 2)],
    "tf__use_idf": [True, False],
}

# Exhaustive search over the grid with 2-fold cross-validation in a single process.
gs = GridSearchCV(pipe, params, cv=2, n_jobs=1, scoring=scorer)

In [None]:
# Run the cross-validated grid search on the training split.
gs.fit(X=x_train, y=y_train)

In [None]:
# Score of the fitted search on the training split, using the scorer given to
# GridSearchCV (negated MSE, so values closer to zero are better).
gs.score(X=x_train, y=y_train)

In [None]:
# Score of the fitted search on the held-out test split, using the scorer given
# to GridSearchCV (negated MSE, so values closer to zero are better).
gs.score(X=x_test, y=y_test)


In [None]:
# Tabulate every configuration evaluated by the grid search (one row per candidate).
results = pd.DataFrame(gs.cv_results_)

# Show the best candidates first; rank 1 is the best mean cross-validated score.
results.sort_values("rank_test_score").head(10)

In [None]:
# Standalone baseline, independent of the grid search above:
# TF-IDF features (vocabulary capped at 10,000 terms) + a default random forest.

tfidf = TfidfVectorizer(max_features=10000)
# Learn the vocabulary and IDF weights from the training texts only.
X_train = tfidf.fit_transform(x_train)
# Seeded so the fitted forest -- and the metrics and plots derived from it -- are
# reproducible across runs; trees are built with up to 10 parallel jobs.
model = RandomForestRegressor(n_jobs=10, random_state=42)
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error
# Vectorize the held-out texts with the already-fitted TF-IDF transformer
# (transform only -- no re-fitting on test data), then predict.
X_test = tfidf.transform(x_test)
y_hat_test = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. MSE happens to be
# symmetric, but keeping the documented order avoids silent errors if the
# metric is later swapped for an asymmetric one.
print(mean_squared_error(y_test, y_hat_test))

In [None]:
# Pair observed targets with model predictions for the test split, keeping the
# index of y_test; predictions are attached positionally.
df_re = (
    pd.Series(y_test)
    .to_frame(name="true")
    .assign(predict=y_hat_test)
)

In [None]:
import seaborn as sb
# Predicted vs. observed values; titled so the figure can be told apart from
# other scatter plots of the same two columns.
ax = sb.scatterplot(data=df_re, x="true", y="predict")
ax.set(title="Held-out test split: predicted vs. true citedby-count");

In [None]:
# Same diagnostics on the training split, for comparison with the held-out results.
y_hat_train = model.predict(X_train)

# NOTE: this rebinds df_re, replacing any frame previously stored under that name.
df_re = pd.DataFrame()
df_re["true"] = pd.Series(y_train)
df_re["predict"] = pd.Series(y_hat_train, index=df_re.index)

# Titled so the figure can be told apart from the test-split plot.
ax = sb.scatterplot(data=df_re, x="true", y="predict")
ax.set(title="Training split: predicted vs. true citedby-count")

# scikit-learn metrics take (y_true, y_pred) in that order.
print(mean_squared_error(y_train, y_hat_train))