# 1. Обработка датасета

Считываем входные данные и задаем `target`

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Read the competition's train, test and sample-submission CSVs in one pass.
train_df, test_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
        '/kaggle/input/nlp-getting-started/sample_submission.csv',
    )
)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Keep the label column aside and leave only the feature columns in train_df.
target = train_df["target"]
train_df = train_df.drop(columns=["target"])

Unnamed: 0,id,keyword,location,text
0,1,,,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...


Посчитаем количество пропущенных значений (NaN) в каждом столбце:

In [None]:
# Number of missing (NaN) entries per column; the bare name on the last line
# makes the notebook display the resulting Series.
missing_per_column = train_df.isna().sum()
missing_per_column

id             0
keyword       61
location    2533
text           0
dtype: int64

Подготовим твиты в обеих выборках к анализу: приведём текст к нижнему регистру и с помощью регулярных выражений уберём лишние элементы — URL, @упоминания и пунктуацию. У хештегов удалим только символ `#`, а само слово сохраним

In [None]:
import re

def clean(text: str) -> str:
    """Normalise a raw tweet for downstream tokenisation.

    The input is coerced to ``str`` and lower-cased, then the following
    substitutions are applied in order:

    1. URLs (``http://``, ``https://`` or ``www.`` prefixed) are removed.
    2. ``@mentions`` are removed.
    3. Hashtags lose the leading ``#`` but keep their word.
    4. Every remaining character that is neither a word character nor
       whitespace is replaced by a space.

    Finally, runs of whitespace are collapsed to single spaces and the
    result is stripped.

    Args:
        text: The raw tweet. Non-string values are converted with ``str()``.

    Returns:
        The cleaned, lower-cased text.
    """
    # Order matters: URLs and mentions must go before punctuation is
    # stripped, otherwise their fragments would survive as separate words.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'@\w+', ''),
        (r'#(\w+)', r'\1'),
        (r'[^\w\s]', ' '),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return ' '.join(cleaned.split())

# Store the cleaned tweet text next to the raw text in both splits.
for frame in (train_df, test_df):
    frame["text_clean"] = frame["text"].apply(clean)


Unnamed: 0,text,text_clean
0,Our Deeds are the Reason of this #earthquake M...,our deeds are the reason of this earthquake ma...
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,all residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",13 000 people receive wildfires evacuation ord...
4,Just got sent this photo from Ruby #Alaska as ...,just got sent this photo from ruby alaska as s...


Заменим числа в текстах твитов на числительные:

In [None]:
import inflect

def numbers(text):
    """Spell out the digit sequences found in ``text`` as words.

    Two shapes of number are recognised:

    * groups of digits separated by single whitespace characters, as in
      ``13 000`` (the form a thousands separator takes after punctuation
      has been replaced by spaces);
    * any other standalone run of digits.

    Each match is passed to ``inflect``'s ``number_to_words``. If the
    conversion fails, the matched text is left unchanged.

    NOTE(review): two unrelated numbers that happen to look like a grouped
    number (e.g. ``911 100``) are merged into one by the first alternative
    of the pattern.

    Args:
        text: Text in which numbers should be replaced.

    Returns:
        ``text`` with every matched number replaced by its spelled-out form.
    """
    p = inflect.engine()

    def replace_number(match):
        number_str = match.group()

        # The pattern accepts any whitespace character as a group separator,
        # so remove all whitespace, not only plain spaces.
        clean_number = ''.join(number_str.split())

        try:
            return p.number_to_words(clean_number)
        except Exception:
            # Best-effort fallback: keep the original digits. `Exception`
            # rather than a bare `except` so that KeyboardInterrupt and
            # SystemExit still propagate.
            return number_str

    pattern = r'\b\d{1,3}(?:\s\d{3})+\b|\b\d+\b'
    result = re.sub(pattern, replace_number, text)

    return result

# Replace digits with words in the cleaned text of both splits, in place.
for frame in (train_df, test_df):
    frame["text_clean"] = frame["text_clean"].apply(numbers)

Unnamed: 0,text,text_clean
0,Our Deeds are the Reason of this #earthquake M...,our deeds are the reason of this earthquake ma...
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,all residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",thirteen thousand people receive wildfires eva...
4,Just got sent this photo from Ruby #Alaska as ...,just got sent this photo from ruby alaska as s...


Приведём слова к нормальной форме

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources requested by this notebook.
# NOTE(review): 'stopwords' is downloaded but not used in the visible cells.
for resource in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

# Shared lemmatizer instance used by `norma`.
lemmatizer = WordNetLemmatizer()

def norma(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is passed to the module-level ``lemmatizer`` with its default
    arguments, and the results are joined back with single spaces.

    NOTE(review): short function words can be altered too; the notebook
    output shows "as" becoming "a".

    Args:
        text: Space-separated, already cleaned text.

    Returns:
        The text with every word replaced by its lemma.
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


# Build the lemmatized `final_text` column for both splits.
for frame in (train_df, test_df):
    frame['final_text'] = frame['text_clean'].apply(norma)

# Preview the first rows of the processed training frame.
train_df.head()

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,keyword,location,text,text_clean,final_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,our deeds are the reason of this earthquake ma...,our deed are the reason of this earthquake may...
1,4,,,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,all residents asked to shelter in place are be...,all resident asked to shelter in place are bei...
3,6,,,"13,000 people receive #wildfires evacuation or...",thirteen thousand people receive wildfires eva...,thirteen thousand people receive wildfire evac...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,just got sent this photo from ruby alaska as s...,just got sent this photo from ruby alaska a sm...


Теперь оцифруем тексты, рассчитав важность слов через TF-IDF. Важно: словарь и IDF-веса обучаем только на тренировочных данных, а тестовую выборку лишь преобразуем уже обученным векторизатором

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Only `final_text` is needed from here on; drop the intermediate text columns.
intermediate_columns = ['text', 'text_clean']
train_df = train_df.drop(columns=intermediate_columns)
test_df = test_df.drop(columns=intermediate_columns)

# Unigrams and bigrams, vocabulary capped at 30000 terms; min_df / max_df
# filter out terms that are too rare or too common across documents.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=30000,
    min_df=2,
    max_df=0.95,
)

# Fit on the training texts only, then reuse the fitted vectorizer on test.
X_train = tfidf_vectorizer.fit_transform(train_df['final_text'])
X_test = tfidf_vectorizer.transform(test_df['final_text'])

(7613, 16859)

# 2. Обучение модели

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


# Candidate models and the hyper-parameter grids to search for each of them.
# Each entry maps a display name to:
#   'model'  - an unfitted estimator instance;
#   'params' - a grid accepted by GridSearchCV's `param_grid` (a dict, or a
#              list of dicts when sub-grids must not be crossed).
classifiers = {
    'LogisticRegression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': {
            'C': [0.1, 1.0, 10],
            'penalty': ['l1', 'l2'],
            # liblinear is used because both l1 and l2 penalties are searched.
            'solver': ['liblinear'],
        },
    },
    'SVC': {
        'model': SVC(random_state=42),
        # `gamma` has no effect on the linear kernel, so it is searched only
        # for rbf. Crossing it with 'linear' would fit every linear candidate
        # twice with identical results.
        'params': [
            {
                'kernel': ['linear'],
                'C': [0.1, 1.0, 10],
            },
            {
                'kernel': ['rbf'],
                'C': [0.1, 1.0, 10],
                'gamma': ['scale', 'auto'],
            },
        ],
    },
}

Протестируем различные комбинации параметров для каждой модели. Для объективности используем трёхкратную кросс-валидацию: обучающие данные делятся на три части, и каждая из них по очереди служит проверочной, пока модель обучается на двух остальных. Качество оцениваем по метрике F1

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Per-classifier outcome of the search: best estimator, its parameters and
# its cross-validated F1 score.
results = {}

for name, clf_info in classifiers.items():
    # Exhaustive search over the classifier's grid: 3-fold CV, F1 scoring,
    # n_jobs=-1 to parallelise the fits.
    grid_search = GridSearchCV(
        clf_info['model'],
        clf_info['params'],
        scoring='f1',
        cv=3,
        n_jobs=-1,
        verbose=1,
    ).fit(X_train, target)

    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_

    results[name] = {
        'model': best_model,
        'best_params': grid_search.best_params_,
        'best_f1': best_score,
    }

# Pick the classifier with the highest cross-validated F1.
best_classifier_name = max(results, key=lambda clf_name: results[clf_name]['best_f1'])
best_model = results[best_classifier_name]['model']

# Predict labels for the test matrix and pair them with the test ids.
y_pred_test = best_model.predict(X_test)
submission = pd.DataFrame({'id': test_df['id']}).assign(target=y_pred_test)

# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)