# Modeling

In [53]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)
warnings.filterwarnings("ignore", category = UserWarning)

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer, snowball
from nltk.corpus import stopwords
from nltk import FreqDist
from wordcloud import WordCloud
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.util import ngrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder 
from nltk.util import ngrams

nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\capta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_parquet('./Data/processed_reports.parquet')

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4209 entries, 0 to 5744
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   player            4209 non-null   object 
 1   report            4209 non-null   object 
 2   year              4209 non-null   float64
 3   pos_x             3985 non-null   object 
 4   weight            3986 non-null   float64
 5   height            3986 non-null   float64
 6   pos_rk            3871 non-null   float64
 7   ovr_rk            3292 non-null   float64
 8   grade             3877 non-null   float64
 9   pos_y             3980 non-null   object 
 10  age               3979 non-null   object 
 11  Round             4209 non-null   Int64  
 12  Pick              4209 non-null   Int64  
 13  target            4209 non-null   Int64  
 14  processed_report  4209 non-null   object 
 15  joined_report     4209 non-null   object 
 16  reportlen         4209 non-null   int64  


In [25]:
tf = TfidfVectorizer(ngram_range=(1,2),min_df=.005,max_df =.9)

In [26]:
X = tf.fit_transform(df.joined_report)

In [27]:
report_df = pd.DataFrame(X.toarray(),columns=tf.get_feature_names())

In [40]:
df.reset_index(inplace=True)

In [42]:
report_df['Round'] = df.Round

In [45]:
report_df

Unnamed: 0,100,2000,2001,2001 play,2001 start,2002,2002 play,2002 season,2002 start,2003,...,year total,yet,zero,zone,zone block,zone blocker,zone coverag,zone heavi,zone look,Round
0,0.0,0.0,0.000000,0.0,0.0,0.043125,0.0,0.0,0.0,0.075587,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1
1,0.0,0.0,0.000000,0.0,0.0,0.054929,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1
2,0.0,0.0,0.000000,0.0,0.0,0.088096,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1
3,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.061153,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1
4,0.0,0.0,0.101247,0.0,0.0,0.085514,0.0,0.0,0.0,0.074943,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,7
4205,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,7
4206,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,7
4207,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.111233,0.0,0.0,0.0,0.187635,0.0,7


In [77]:
X = report_df.drop(['Round'],axis=1)
y = report_df.Round

In [78]:
report_df.Round

0       1
1       1
2       1
3       1
4       1
       ..
4204    7
4205    7
4206    7
4207    7
4208    7
Name: Round, Length: 4209, dtype: int32

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.33,random_state=42)

In [85]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

In [87]:
y_train

array([5, 3, 6, ..., 3, 1, 6])

In [84]:
report_df.Round

0       1
1       1
2       1
3       1
4       1
       ..
4204    7
4205    7
4206    7
4207    7
4208    7
Name: Round, Length: 4209, dtype: int32

In [79]:
pipe = Pipeline([
    ('sample',None),
    ('tree',DecisionTreeClassifier(random_state=42))
])

grid = {
    'sample':[RandomOverSampler(random_state=42),SMOTE(random_state=42)]
}

In [80]:
gridsearch = GridSearchCV(estimator=pipe,
                          param_grid=grid,
                          cv=5)

In [76]:
report_df['Round'] = report_df.Round.astype('int')

In [82]:
from sklearn.utils import multiclass
print(multiclass.type_of_target(y_train))

unknown


In [88]:
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train,y_train)
y_preds = tree.predict(X_test)

In [60]:
gridsearch.fit(X_train,y_train)
y_pred = gridsearch.predict(X_test)

10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\capta\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\capta\anaconda3\lib\site-packages\imblearn\pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\capta\anaconda3\lib\site-packages\imblearn\pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "C:\Users\capta\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs

ValueError: Unknown label type: 'unknown'