In [49]:
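# One-time environment setup: uncomment to install spaCy into the kernel's environment.
# The "en_core_web_trf" model loaded further down has to be installed as well.
#%pip install spacy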

In [1]:
import requests
import seaborn as sns
import pandas as pd
import time
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, roc_curve
from nltk import WordNetLemmatizer
import spacy

In [2]:
# Read the pickled object that later cells label as r/Cooking.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files you created yourself or otherwise trust.
with open('../data/pickledcooking.pkl', mode='rb') as cooking_file:
    df_cook = pickle.load(cooking_file)

In [3]:
# Read the pickled object that later cells label as r/Cookingforbeginners.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files you created yourself or otherwise trust.
with open('../data/pickledbeginners.pkl', mode='rb') as beginners_file:
    df_cforb = pickle.load(beginners_file)

In [4]:
# Tag every row of each frame with the subreddit it came from.
for frame, subreddit in (
    (df_cook, 'r/Cooking'),
    (df_cforb, 'r/Cookingforbeginners'),
):
    frame['sub'] = subreddit

In [5]:
# Stack the title and subreddit-label columns of both frames into one table.
# The original row labels of each frame are kept as-is.
textdata = pd.concat(
    [frame[['title', 'sub']] for frame in (df_cook, df_cforb)]
)

In [6]:
# Features are the raw post titles; the target is the subreddit label.
X, y = textdata['title'], textdata['sub']

In [7]:
# Relative frequency of each distinct title, most common first.
# Repeated titles show up here as entries with a larger share.
title_share = X.value_counts(normalize=True, ascending=False)
title_share.head()

Culinary Arts School Delhi - Diploma Courses in Culinary Arts    0.000415
YWB mez-Scotty Pippin                                            0.000208
A dumb question about leaving dough out                          0.000161
Help!                                                            0.000138
Where do I start?                                                0.000138
Name: title, dtype: float64

In [8]:
# Hold out a test set. No test_size is given, so scikit-learn's default applies.
# The split is stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=39,
)

In [29]:
# Baseline: unigram bag-of-words counts feeding an L2-penalised logistic regression.
# Several arguments are spelled out on purpose, as reminders of the knobs available.
bow_unigrams = CountVectorizer(strip_accents='unicode', ngram_range=(1, 1), max_features=None)
l2_logreg = LogisticRegression(penalty='l2', max_iter=1000)
pipe1 = Pipeline(steps=[('cv', bow_unigrams), ('logreg', l2_logreg)])

In [30]:
# Fit the baseline pipeline, then display its score as (train, test).
pipe1.fit(X_train, y_train)
tuple(
    pipe1.score(titles, labels)
    for titles, labels in ((X_train, y_train), (X_test, y_test))
)

(0.7621186023622047, 0.6118287506920096)

In [41]:
# AdaBoost on a capped bag-of-words: at most 1000 terms, dropping any term that
# appears in more than 80% of the titles.
# NOTE(review): this cell's execution count is later than that of the cell that
# fits pipe3, so the saved scores may come from an earlier definition. Re-run to confirm.
capped_counts = CountVectorizer(strip_accents='ascii', max_features=1000, max_df=0.8)
boosted_trees = AdaBoostClassifier(n_estimators=3000, random_state=815)
pipe3 = Pipeline(steps=[('cv', capped_counts), ('abc', boosted_trees)])

In [32]:
# Fit the AdaBoost pipeline, then display its score as (train, test).
pipe3.fit(X_train, y_train)
tuple(
    pipe3.score(titles, labels)
    for titles, labels in ((X_train, y_train), (X_test, y_test))
)

(0.6574495570866141, 0.6121978224764717)

In [33]:
# Vectorise the training titles outside a Pipeline so that the fitted vocabulary
# and the encoded matrix are available as standalone objects.
cv = CountVectorizer(
    strip_accents='unicode',
    ngram_range=(1, 1),
    max_features=None,
)
X_enc = cv.fit_transform(X_train, y=y_train)

In [34]:
# Display the encoded training matrix (its repr reports shape, dtype and stored entries).
X_enc

<32512x15752 sparse matrix of type '<class 'numpy.int64'>'
	with 272165 stored elements in Compressed Sparse Row format>

In [35]:
# Logistic regression on the count-encoded training titles, with a generous
# iteration budget for the solver.
logreg = LogisticRegression(max_iter=10_000)
logreg.fit(X=X_enc, y=y_train)

LogisticRegression(max_iter=10000)

In [36]:
# AdaBoost with 500 estimators on the count-encoded training titles (seeded).
abc = AdaBoostClassifier(
    n_estimators=500,
    random_state=815,
)
abc.fit(X=X_enc, y=y_train)

AdaBoostClassifier(n_estimators=500, random_state=815)

In [37]:
# Score of the AdaBoost model on the same data it was fitted on (training set only).
abc.score(X=X_enc, y=y_train)

0.6503752460629921

In [38]:
# Pair each vocabulary term with its AdaBoost feature importance and show the
# 25 highest-ranked terms.
abc_importances = pd.DataFrame(
    data=abc.feature_importances_,
    index=cv.get_feature_names_out(),
    columns=['ft_imp'],
)
abc_importances.sort_values(by='ft_imp', ascending=False).head(25)

Unnamed: 0,ft_imp
this,0.004
how,0.004
of,0.004
what,0.004
cookies,0.004
frozen,0.004
paratha,0.004
your,0.004
and,0.004
healthy,0.004


In [39]:
# TF-IDF features over unigrams and bigrams with English stop words removed.
# Terms must occur in at least 5 titles and in at most 80% of them; each row is
# L1-normalised.
tfidf = TfidfVectorizer(
    strip_accents='ascii',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
    norm='l1',
)
# fit_transform learns the vocabulary and IDF weights and encodes the training
# titles in one pass, rather than tokenising the corpus once in fit() and again
# in transform(). The labels are not passed because the vectoriser does not use them.
X_enctoo = tfidf.fit_transform(X_train)

In [40]:
# Same AdaBoost configuration as before, this time fitted on the TF-IDF encoding.
abctoo = AdaBoostClassifier(
    n_estimators=500,
    random_state=815,
)
abctoo.fit(X=X_enctoo, y=y_train)


AdaBoostClassifier(n_estimators=500, random_state=815)

In [41]:
# Pair each TF-IDF term with its AdaBoost feature importance and show the
# 25 highest-ranked terms.
abctoo_importances = pd.DataFrame(
    data=abctoo.feature_importances_,
    index=tfidf.get_feature_names_out(),
    columns=['ft_imp'],
)
abctoo_importances.sort_values(by='ft_imp', ascending=False).head(25)

Unnamed: 0,ft_imp
looking,0.004
scratch,0.004
amp,0.004
learn,0.004
street,0.004
cook,0.004
sweet potatoes,0.004
frozen,0.004
thing,0.004
cooking,0.004


In [42]:
# Held-out score of the TF-IDF AdaBoost model. The test titles are encoded with
# the vectoriser that was fitted on the training titles.
X_test_enctoo = tfidf.transform(X_test)
abctoo.score(X_test_enctoo, y_test)

0.5988189702897213

In [10]:
# Load the spaCy pipeline used for lemmatisation in the cells that follow.
SPACY_MODEL = "en_core_web_trf"
nlp = spacy.load(SPACY_MODEL)

In [18]:
# Smoke test: run the pipeline on a short sample sentence.
sample_sentence = 'I am becoming a better human being'
doc = nlp(sample_sentence)

In [44]:
# Lemmatise every training title; each result is the title's lemmas joined by spaces.
# Calling nlp() once per title pushes texts through the transformer one at a time,
# which was slow enough for the run to be interrupted by hand. nlp.pipe streams the
# titles through in batches instead. The parser and NER components are skipped
# because only token lemmas are read here.
# NOTE(review): this can still take a while on CPU. Consider caching X_trlemma to disk.
X_trlemma = [
    ' '.join(token.lemma_ for token in title_doc)
    for title_doc in nlp.pipe(X_train, batch_size=64, disable=['parser', 'ner'])
]
    



KeyboardInterrupt: 

In [35]:
# Worked example: lemmatise the second training title and rejoin the lemmas with spaces.
second_title_doc = nlp(X_train.iloc[1])
' '.join(token.lemma_ for token in second_title_doc)

'what be your favourite kind of cheese , and your favourite way to use it ?'

In [34]:
# Show the first training title as raw text (positional lookup).
X_train.iat[0]

'Easiest Shakshouka ever! Even I can make it!'

In [None]:
# CountVectorizer's `tokenizer` must be a callable that maps one string to a list
# of tokens. A WordNetLemmatizer instance is not callable, so passing it directly
# fails with a TypeError as soon as the vectoriser is fitted. Wrap it in a function.
_wordnet = WordNetLemmatizer()
_default_tokenize = CountVectorizer().build_tokenizer()  # scikit-learn's default word splitter


def lemma_tokenizer(text):
    """Split ``text`` into word tokens and return the WordNet lemma of each one.

    Parameters
    ----------
    text : str
        A single document. Inside CountVectorizer the tokenizer is called after
        preprocessing (lower-casing, accent stripping).

    Returns
    -------
    list of str
        One lemma per token, in the original token order. ``lemmatize`` is
        called without a part-of-speech hint, so NLTK's default applies.

    Notes
    -----
    NLTK's WordNet data must be available locally (``nltk.download('wordnet')``).
    """
    return [_wordnet.lemmatize(token) for token in _default_tokenize(text)]


pipe4 = Pipeline([
    # token_pattern=None: the pattern is unused once a custom tokenizer is given,
    # and leaving it set makes scikit-learn emit a warning about that.
    ('cv', CountVectorizer(strip_accents='ascii', tokenizer=lemma_tokenizer, token_pattern=None)),
])