In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
from utils import *

import pandas as pd; pd.set_option('precision', 4)
import numpy as np

from nltk.corpus import stopwords; nltk.download('stopwords')

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB as NB

from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jacobgdt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#twtc = pd.read_json('labelled_data.json', orient='split')
twtc = pd.read_csv('labelled.csv')
# orig_cols = twtc.columns

print(twtc.shape)
twtc.head()

(5824, 69)


Unnamed: 0,name,key_mlbam,key_fangraphs,key_bbref,key_bbref_minors,key_uuid,mlb_played_first,birthdate,debut_age,age,...,OPS,AB,SV,BB,W,SLG,eta_age,time_left,cutoff,label
0,**Luke Heimlich,-1,18360.0,,,,,,,22.3,...,0.8096,291.0,0.0,26.0,0.0,0.4639,23.4755,1.1755,2019.0,0
1,A.J. Alexy,669935,,,alexy-000aj-,c5bd95ff-593c-4875-b48e-012da0caca64,,1998-04-21T00:00:00.000Z,,18.7,...,0.0,0.0,0.0,0.0,1.0,0.0,21.3672,2.6672,2019.0,0
2,A.J. Cole,595918,11467.0,coleaj01,cole--001aj-,0596c185-ae62-436a-92b9-79b9a145d64d,2015.0,1992-01-05T00:00:00.000Z,23.0,22.0,...,0.0,8.0,0.0,0.0,6.0,0.0,23.4755,1.4755,2015.0,1
3,A.J. Cole,595918,11467.0,coleaj01,cole--001aj-,0596c185-ae62-436a-92b9-79b9a145d64d,2015.0,1992-01-05T00:00:00.000Z,23.0,24.0,...,0.3333,4.0,0.0,2.0,8.0,0.0,24.5296,0.5296,2016.0,1
4,A.J. Cole,595918,11467.0,coleaj01,cole--001aj-,0596c185-ae62-436a-92b9-79b9a145d64d,2015.0,1992-01-05T00:00:00.000Z,23.0,21.0,...,0.0,5.0,0.0,0.0,6.0,0.0,22.9484,1.9484,2014.0,0


### Merging

In [3]:
#df.fillna({'Level': 'AMTR', 'report': ''}, inplace=True)
df = twtc.copy()

df = df.drop(columns=[c for c in dropped_cols if c in df.columns])
df = df.filter(regex='^(?!key)\w+$', axis='columns')

df = df.reset_index(drop=True)

df = onehot_encode_column(df, 'primary_position')
df = onehot_encode_column(df, 'Level')

#df = df.dropna()

# df['old_report'] = df['report'].copy()

df.report = list(apply_text_mask(df.report, processes=6))
df.report = df.report.str.replace('(\d+)', 'NUMBER', regex=True)

#df = df[df.age != '']

print(df.shape)
df.head()


(5824, 74)


Unnamed: 0,name,age,report,Arm,Changeup,Control,Curveball,Cutter,Fastball,Field,...,pos_RF,pos_RHP,pos_SS,pos_A,pos_A+,pos_A-,pos_AA,pos_AAA,pos_R,pos_UNK
0,**Luke Heimlich,22.3,PERSON is a Level NUMBER sex offender and woul...,0,55,55,55,0,55,0,...,0,0,0,0,0,0,0,1,0,0
1,A.J. Alexy,18.7,PERSON made headlines for all the wrong reason...,0,50,50,55,0,55,0,...,0,1,0,0,0,0,0,0,1,0
2,A.J. Cole,22.0,"The ORGANIZATION have acquired PERSON twice, f...",0,55,55,0,0,70,0,...,0,1,0,0,0,0,0,1,0,0
3,A.J. Cole,24.0,Signed for an above-slot $NUMBER million as a ...,0,55,55,45,0,55,0,...,0,1,0,0,0,0,0,1,0,0
4,A.J. Cole,21.0,"It often takes time for those high-ceilinged, ...",0,50,50,0,0,70,0,...,0,1,0,0,0,0,1,0,0,0


In [None]:
df.to_csv('preprocessed.csv', index=False)

## `Sk-learn` Modeling

In [19]:
from sklearn.svm import SVC

In [20]:
CLFS = {
    'lr': lambda: LogisticRegression(max_iter=2000, solver='lbfgs'), # C=1e-2
    'svm': lambda: SVC(), #LinearSVC(max_iter=7500),
    'nb':  lambda: NB(),
    'knn': lambda: KNeighborsClassifier(10),
    'rf': lambda: RandomForestClassifier(n_estimators=500),
    'nn': lambda: MLPClassifier(alpha=1e-2, max_iter=1000, early_stopping=True, hidden_layer_sizes=(300,)),
    'ada': lambda: AdaBoostClassifier(),
    'sgd': lambda: SGDClassifier(max_iter=1000, tol=1e3),
    'gb': GradientBoostingClassifier,
}

voters = ['rf', 'nn', 'sgd', 'svm']
CLFS['vote'] = lambda: VotingClassifier([(v, CLFS[v]) for v in voters])

In [7]:
FEATURE_COLS = ['age', 'Arm', 'Changeup', 'Control', 'Curveball',
       'Cutter', 'Fastball', 'Field', 'Hit', 'Power', 'Run', 'Slider',
       'Splitter', 'OBP', 'CS', 'WHIP', 'HR', 'ERA', 'SO',
       'H', 'SO_pit', 'L', 'R', 'RBI', '3B', 'BS', 'GP', 'TBF',
       '2B', 'AVG', 'HR_pit', 'IP', 'SB', 'Hld', 'Pitches', 'BB_pit', 'OPS',
       'AB', 'SV', 'BB', 'W', 'SLG',
       'pos_1B', 'pos_2B', 'pos_3B', 'pos_C', 'pos_CF', 'pos_INF', 'pos_LF',
       'pos_LHP', 'pos_OF', 'pos_RF', 'pos_RHP', 'pos_SS', 'pos_A', 'pos_A+',
       'pos_A-', 'pos_AA', 'pos_AAA', 'pos_R', 'pos_UNK']

In [8]:
from sklearn.preprocessing import PolynomialFeatures

def fit_pipeline(X_train, y_train, featurizer='union', clf='lr', **kwargs):
    tfidf_pipe = Pipeline([
        ('report_tfidf', Pipeline([
            ('selector', ItemSelector(key='report')),
            ('tfidf', TfidfVectorizer(
                #max_features=10000, 
                strip_accents='unicode',
                **kwargs
            )),
        ]))
    ])

    meta = Pipeline([
        ('selector', ItemSelector(key=FEATURE_COLS)),
        ('feature_interactions', PolynomialFeatures(interaction_only=True)),
        ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
        ('scale', StandardScaler(with_mean=False))
    ])

    features = None
    if featurizer == 'union':
        features = FeatureUnion([
            ('metadata', meta),
            ('tfidf', tfidf_pipe)
        ])
    elif featurizer == 'tfidf':
        features = tfidf_pipe
    elif featurizer == 'metadata':
        features = meta
    else:
        raise ValueError(f'Invalid featurizer: {featurizer}')

    clf_model = CLFS[clf]()
    print(f'Training {type(clf_model).__name__} with {featurizer} features on {X_train.shape} training set.')

    pipe = Pipeline([
        ('featurizer', features),
        
        ('clf', clf_model)
    ]).fit(X_train, y_train)

    return pipe

In [9]:
def top_features(clf, fts, coef=None, order='top', n=10):
    if coef is None:
        if not hasattr(clf, 'coef_'):
            print(f'Model {type(clf).__name__} has no coefficients. Please use a linear model.')
            return
        coef = clf.coef_.reshape(-1)
    for f, c in sorted(zip(fts, coef), key=lambda x: x[1], reverse=(order == 'top'))[:n]:
        print(f'{f}: {c:.4f}')

In [None]:
train_df, test_df = split_datasets(df, return_dfs=True, rebalance_pct=0.25)#0.65)

polyfts = Pipeline([
        ('selector', ItemSelector(key=FEATURE_COLS)),
        ('feature_interactions', PolynomialFeatures(interaction_only=True))
]).fit(df)

poly_train = pd.DataFrame(polyfts.transform(train_df))
poly_test = pd.DataFrame(polyfts.transform(test_df))

pd.concat([train_df.reset_index(drop=True), poly_train], axis=1).to_csv('../twtc/train.csv', index=False)
pd.concat([test_df.reset_index(drop=True), poly_test], axis=1).to_csv('../twtc/test.csv', index=False)

In [10]:


#from nltk import word_tokenize


def split_datasets(df_, return_dfs=False, rebalance_pct=None):
    df = df_.copy()
    
    """
    if rebalance_pct:
        upsampled = df[df.label == 1].sample(frac=rebalance_pct)
        df = pd.concat([df, upsampled], axis=0)
    
    y = df['label']
    X = df.drop(columns=['name', 'label', 'name_count', 'clean_name', 'mlbam_candidate', 'old_mlbam',
           'fg_season_id', 'am_season_id', 'eta_age', 'time_left', 'cutoff', 'Age_pit'])
    
    return train_test_split(X, y, test_size=0.1, random_state=42)
    """
    train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
    if rebalance_pct:
        upsampled = train_df[train_df.label == 1].sample(frac=rebalance_pct, replace=True)
        train_df = pd.concat([train_df, upsampled], axis=0)
    
    if return_dfs:
        return train_df, test_df
    
    cols = ['name', 'label', 'name_count', 'clean_name', 'mlbam_candidate', 'old_mlbam',
           'fg_season_id', 'am_season_id', 'eta_age', 'time_left', 'cutoff', 'Age_pit']
    
    return train_df.drop(columns=cols), test_df.drop(columns=cols), train_df['label'], test_df['label']
    
    

X_train, X_test, y_train, y_test = split_datasets(df, rebalance_pct=None)
print(y_train.value_counts() / len(y_train))
print(y_test.value_counts() / len(y_test))
# X_train.head()

0    0.8021
1    0.1979
Name: label, dtype: float64
0    0.8027
1    0.1973
Name: label, dtype: float64


In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

sw = {'number', 'gpe', 'person', 'organization', 'numberth'}
#sw.update(stopwords.words('english'))

clf = Pipeline([
    ('vect',CountVectorizer(min_df=0.00, ngram_range=(1,3), stop_words=sw)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(class_weight={1: 0.85})) # CLFS['svm']()
])



clf.fit(X_train.report, y_train)
preds = clf.predict(X_test.report)
print(classification_report(y_test, preds))

In [None]:
clf = fit_pipeline(X_train, y_train, 'metadata', 'svm')
print(classification_report(y_test, clf.predict(X_test)))

top_features(clf.get_params()['clf'], [c for c in X.columns if c != 'report'])

In [None]:
feature_names = clf.get_params()['vect'].get_feature_names()
[feature_names[i] for i in clf.get_params()['clf'].coef_[0].argsort()[-10:][::-1]]
[feature_names[i] for i in clf.get_params()['clf'].coef_[0].argsort()[:10]]

In [None]:
clf = fit_pipeline(X_train, y_train, 'tfidf', 'lr', 
                   max_df=0.8,
                   min_df=0.0,#25, 
                   ngram_range=(1, 4), 
                   #stop_words={'number', 'gpe', 'person'},
                   max_features=2000
                  )
print(classification_report(y_test, clf.predict(X_test)))

clf_tfidf = clf.get_params()['featurizer'].get_params()['report_tfidf'].get_params()['tfidf']
#top_features(None, clf_tfidf.get_feature_names(), clf_tfidf.idf_, n=15)

In [21]:
clf = fit_pipeline(X_train, y_train, 'union', 'svm', max_df=0.8, min_df=0.05)
print(classification_report(y_test, clf.predict(X_test)))

#top_features(clf.get_params()['clf'], [c for c in X.columns if c != 'report'])

Training SVC with union features on (5241, 62) training set.




              precision    recall  f1-score   support

           0       0.82      0.97      0.89       468
           1       0.59      0.15      0.24       115

    accuracy                           0.81       583
   macro avg       0.70      0.56      0.56       583
weighted avg       0.78      0.81      0.76       583



In [4]:
import torch

In [38]:
roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
roberta.eval()  # disable dropout (or leave in train mode to finetune)

Using cache found in /home/jacobgdt/.cache/torch/hub/pytorch_fairseq_master
100%|██████████| 231160875/231160875 [03:17<00:00, 1170181.49B/s]


loading archive file http://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz from cache at /home/jacobgdt/.cache/torch/pytorch_fairseq/37d2bc14cf6332d61ed5abeb579948e6054e46cc724c7d23426382d11a31b2d6.ae5852b4abc6bf762e0b6b30f19e741aa05562471e9eb8f4a6ae261f04f9b350
extracting archive file /home/jacobgdt/.cache/torch/pytorch_fairseq/37d2bc14cf6332d61ed5abeb579948e6054e46cc724c7d23426382d11a31b2d6.ae5852b4abc6bf762e0b6b30f19e741aa05562471e9eb8f4a6ae261f04f9b350 to temp dir /tmp/tmpkqlorag7
| dictionary: 50264 types


RobertaHubInterface(
  (model): RobertaModel(
    (decoder): RobertaEncoder(
      (sentence_encoder): TransformerSentenceEncoder(
        (embed_tokens): Embedding(50265, 768, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(514, 768, padding_idx=1)
        (layers): ModuleList(
          (0): TransformerSentenceEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          )
          (1): TransformerSentenceEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
       

In [39]:
roberta.cuda()

RuntimeError: CUDA out of memory. Tried to allocate 148.00 MiB (GPU 0; 3.94 GiB total capacity; 1.16 GiB already allocated; 14.25 MiB free; 4.59 MiB cached)

In [31]:
def roberta_featurizer(txt):
    tokens = roberta.encode(txt)
    features = roberta.extract_features(tokens)
    
    return features.mean(dim=1).squeeze(0)

In [35]:
#embedded_reports = list(tqdm_parallel(roberta_featurizer, df.report.sample(100), processes=2))
[roberta_featurizer(t) for t in df.report.sample(10)]

KeyboardInterrupt: 

In [33]:
len(embedded_reports)

TypeError: object of type 'generator' has no len()