# Avito Category Prediction
 The task is to predict the category of Avito listings (publications) from their title and description text.

In [1]:
import pandas as pd

In [None]:
# Read the training and test tables from the competition folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'avito-text-classification/train.csv',
        'avito-text-classification/test.csv',
    )
)

In [None]:
# Show the first five rows of the training table as a quick sanity check.
train.head(n=5)

In [None]:
# Replace missing values with empty strings, in place, so that the string
# concatenation of title and description below never produces NaN.
train.fillna(value='', inplace=True)
test.fillna(value='', inplace=True)

In [None]:
# Merge title and description into one space-separated 'text' column.
# `pop` removes each source column from `train` as it is read, so only the
# combined column remains afterwards.
train['text'] = train.pop('title') + ' ' + train.pop('description')

In [None]:
# Same preparation for the test table: one space-separated 'text' column
# replaces the separate title and description columns.
test['text'] = test['title'].add(' ').add(test['description'])
test.drop(columns=['title', 'description'], inplace=True)

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from string import punctuation, digits

# Lazily filled cache: the stemmer and the exclusion set are identical for
# every document, so they are built on first use instead of on every call.
_TOKENIZER_RESOURCES = {}


def _tokenizer_resources():
    """Return the shared ``(stemmer, excluded_tokens)`` pair, building it once."""
    if not _TOKENIZER_RESOURCES:
        _TOKENIZER_RESOURCES['stemmer'] = SnowballStemmer('russian')
        # Punctuation and digit entries are single characters, so with the
        # length filter used in `tokenizer` only the stop words can actually
        # match; they are kept so the exclusion set is unchanged.
        _TOKENIZER_RESOURCES['excluded'] = (
            frozenset(punctuation)
            | frozenset(digits)
            | frozenset(stopwords.words('russian'))
        )
    return _TOKENIZER_RESOURCES['stemmer'], _TOKENIZER_RESOURCES['excluded']


def tokenizer(s: str) -> list:
    """
    Split a text into lower-cased, stemmed tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens of two
    characters or fewer are dropped, as are tokens equal to a Russian stop
    word, a punctuation character or a digit. Each remaining token is
    reduced to its stem with the Russian Snowball stemmer.

    Args:
        s: Raw text of one document.

    Returns:
        The list of stems, in the order the tokens appear in the text.
    """
    stemmer, excluded = _tokenizer_resources()
    return [
        stemmer.stem(token)
        for token in word_tokenize(s.lower())
        if len(token) > 2 and token not in excluded
    ]

In [None]:
# 1
# Draw a random subset of up to 500000 training rows.
# The size is capped at the number of available rows because sampling more
# rows than exist (without replacement) raises ValueError, and the seed is
# fixed (same value as the rest of the notebook) so the subset is reproducible.
random_data = train.sample(n=min(500000, len(train)), random_state=2021)

In [None]:
# Display the sampled subset.
random_data

In [None]:
# 2
# Hold out 30% of the training table for evaluation; the fixed seed makes
# the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train['text'],
    train['Category'],
    random_state=2021,
    test_size=0.3,
)

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report,accuracy_score

sgd = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenizer)),
    ('clf', SGDClassifier(random_state=2021, n_jobs=8)),
    ])

sgd.fit(train['text'], train['Category'])

sgd_y_pred = sgd.predict(X_test)


# classification_report(y_test, sgd_y_pred)
accuracy_score(y_test, sgd_y_pred)

In [None]:
# Accuracy of the SGD pipeline on the whole training table.
accuracy_score(
    y_true=train['Category'],
    y_pred=sgd.predict(train['text']),
)


In [None]:
# Per-category precision/recall/F1 of the SGD pipeline on the whole training table.
print(
    classification_report(
        y_true=train['Category'],
        y_pred=sgd.predict(train['text']),
    )
)

In [None]:
# Predict a category for every test row with the SGD pipeline, write the
# (Id, Category) pairs to the submission file and preview the result.
output = pd.DataFrame(
    data={
        'Id': test['itemid'],
        'Category': sgd.predict(test['text']),
    }
)
output.to_csv("submission.csv", index=False)
output.head()

In [None]:
%%time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
# from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score

grid_params = {
    'n_neighbors': [1,2,3,4,5,6],
    'weights': ['uniform', 'distance'],
    'metric':['euclidean', 'manhattan']
}


pipe = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenizer)),
    ('clf', GridSearchCV(
        KNeighborsClassifier(),
        grid_params,
        verbose=1,
        cv=3,
        n_jobs=-1)),
    ])

pipe.fit(X_train, y_train)

pipe_y_pred = pipe.predict(X_test)


# classification_report(y_test, sgd_y_pred)
accuracy_score(y_test, pipe_y_pred)

In [None]:
# Hold-out accuracy of the KNN pipeline, followed by its per-category report.
print("test:", accuracy_score(y_true=y_test, y_pred=pipe_y_pred))

print(classification_report(y_true=y_test, y_pred=pipe_y_pred))

In [None]:
# Write the submission file and preview it.
# NOTE(review): this cell follows the KNN experiment but still predicts with
# `sgd`, not `pipe`, so it writes SGD predictions to "submission.csv" again.
# Confirm whether that is intentional.
predicted_rows = {
    'Id': test['itemid'],
    'Category': sgd.predict(test['text']),
}
output = pd.DataFrame(predicted_rows)
output.to_csv("submission.csv", index=False)
output.head()