In [9]:
!pip install nltk
!pip install gensim
!pip install scikit-learn
!pip install seaborn


Collecting gensim
  Using cached gensim-4.3.3.tar.gz (23.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4.tar.gz (15.8 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): still running...
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting scipy<1.14.0,>=

  error: subprocess-exited-with-error
  
  Preparing metadata (pyproject.toml) did not run successfully.
  exit code: 1
  
  [47 lines of output]
  + meson setup C:\Users\uula2\AppData\Local\Temp\pip-install-yn_fv6ad\scipy_e2a8e4ec96ba410892814011192d7fe9 C:\Users\uula2\AppData\Local\Temp\pip-install-yn_fv6ad\scipy_e2a8e4ec96ba410892814011192d7fe9\.mesonpy-bla_6ipr -Dbuildtype=release -Db_ndebug=if-release -Db_vscrt=md --native-file=C:\Users\uula2\AppData\Local\Temp\pip-install-yn_fv6ad\scipy_e2a8e4ec96ba410892814011192d7fe9\.mesonpy-bla_6ipr\meson-python-native-file.ini
  The Meson build system
  Version: 1.9.1
  Source dir: C:\Users\uula2\AppData\Local\Temp\pip-install-yn_fv6ad\scipy_e2a8e4ec96ba410892814011192d7fe9
  Build dir: C:\Users\uula2\AppData\Local\Temp\pip-install-yn_fv6ad\scipy_e2a8e4ec96ba410892814011192d7fe9\.mesonpy-bla_6ipr
  Build type: native build
  Activating VS 17.14.16
  Project name: scipy
  Project version: 1.13.1
  C compiler for the host machine: cl (msvc 19.



In [10]:
# Базові бібліотеки
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP та обробка тексту
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Word2Vec
from gensim.models import Word2Vec

# Машинне навчання
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Моделі
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Download the NLTK resources this notebook relies on.
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence tokenizer used by word_tokenize
# from the 'punkt_tab' resource; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')


ModuleNotFoundError: No module named 'gensim'

In [None]:
# Load the train / test splits of the tweets dataset.
def _read_tweets_csv(path):
    """Read a tweets CSV, retrying as Latin-1 if it is not valid UTF-8.

    NOTE(review): the public Corona_NLP CSV files are commonly reported not to
    be UTF-8 encoded — confirm against the actual files. When the file is valid
    UTF-8 this behaves exactly like a plain ``pd.read_csv(path)``.
    """
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="latin-1")


train_df = _read_tweets_csv("Corona_NLP_train.csv")
test_df = _read_tweets_csv("Corona_NLP_test.csv")

# Check the columns and preview a few rows. A bare `train_df.head()` in the
# middle of a cell is evaluated but never displayed, so print it explicitly.
print(train_df.columns)
print(train_df.head())

# English stop words as a set, for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw tweet and split it into content-word tokens.

    Steps: lowercase, remove URLs and @mentions, drop the '#' sign (the hashtag
    word itself is kept), delete every character that is not a-z or whitespace,
    tokenize with NLTK's ``word_tokenize`` and filter out the words found in the
    module-level ``stop_words`` set.

    Args:
        text: Raw tweet text. Non-string values (e.g. ``None`` or the NaN that
            pandas produces for an empty CSV cell) are treated as an empty tweet.

    Returns:
        list[str]: The remaining lowercase tokens, possibly empty.
    """
    # Guard: .lower() would raise AttributeError on NaN / None cells.
    if not isinstance(text, str):
        return []

    # Lowercase first so the a-z filter below keeps all letters.
    text = text.lower()
    # Remove URLs ("http\S+" already covers https links).
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove @mentions entirely; for hashtags only the '#' sign is removed.
    text = re.sub(r'\@\w+|\#', '', text)
    # Delete digits, punctuation and any other non-letter characters. They are
    # removed rather than replaced by a space, so "don't" becomes "dont".
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Drop stop words
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Tokenize the raw tweet text of both splits into a new 'tokens' column.
for tweets_df in (train_df, test_df):
    tweets_df['tokens'] = tweets_df['OriginalTweet'].apply(preprocess_text)



In [None]:
# Train Word2Vec embeddings on the tokenized training tweets only.
w2v_model = Word2Vec(
    sentences=train_df['tokens'],
    vector_size=100,  # embedding dimensionality
    window=5,         # context window size
    min_count=2,      # ignore words seen fewer than 2 times
    workers=4,        # training threads
    sg=1,             # 1 = skip-gram
)

# Turn a tokenized tweet into a single fixed-length vector.
def tweet_vector(tokens, model, vector_size):
    """Average the embeddings of the tokens known to ``model``.

    Args:
        tokens: Iterable of word tokens for one tweet.
        model: Object exposing ``model.wv`` that supports ``word in model.wv``
            and ``model.wv[word]`` (here: a trained Word2Vec model).
        vector_size: Length of the returned vector; must match the length of
            the vectors stored in ``model.wv``.

    Returns:
        numpy.ndarray: float64 vector of length ``vector_size`` holding the mean
        of the known tokens' vectors, or all zeros if no token is known.
    """
    embeddings = model.wv
    total = np.zeros(vector_size)
    hits = 0
    for token in tokens:
        # Out-of-vocabulary tokens contribute nothing to the average.
        if token not in embeddings:
            continue
        total += embeddings[token]
        hits += 1
    return total / hits if hits else total

# Build one averaged embedding per tweet for the train and test splits.
# The dimensionality is read from the trained model instead of being repeated
# as a literal, so it cannot drift from the value passed to Word2Vec.
vector_size = w2v_model.wv.vector_size
X_train = np.array([tweet_vector(x, w2v_model, vector_size) for x in train_df['tokens']])
X_test = np.array([tweet_vector(x, w2v_model, vector_size) for x in test_df['tokens']])

# Labels: the encoder is fitted on the training sentiments only and then
# reused to encode the test sentiments with the same integer mapping.
le = LabelEncoder()
y_train = le.fit_transform(train_df['Sentiment'])
y_test = le.transform(test_df['Sentiment'])


In [None]:
# Candidate classifiers, all trained on the same averaged Word2Vec features.
# The random forest is seeded so its metrics are repeatable between runs.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Precision / recall / F1 use average='weighted' across the classes.
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1-score': f1_score(y_test, y_pred, average='weighted')
    }

# Results table: one row per model, one column per metric.
results_df = pd.DataFrame(results).T
results_df


In [None]:
# Re-run every model on PCA-reduced features for several component counts.
# PCA cannot produce more components than min(n_samples, n_features); the
# embeddings have 100 dimensions, so a request for 200 would raise ValueError.
# Requests that exceed the limit are reported and skipped.
requested_components = [50, 100, 200]
max_components = min(X_train.shape)
pca_components = [n for n in requested_components if n <= max_components]
skipped_components = [n for n in requested_components if n > max_components]
if skipped_components:
    print(f"Skipping n_components={skipped_components}: "
          f"at most {max_components} components are available")

pca_results = {}

for n in pca_components:
    # Seeded because PCA may pick a randomized SVD solver for some shapes.
    pca = PCA(n_components=n, random_state=42)
    # Fit the projection on the training data only, then apply it to test.
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    for name, model in models.items():
        # fit() re-trains the shared estimator instance on the reduced features.
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)

        pca_results[(name, n)] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, average='weighted'),
            'Recall': recall_score(y_test, y_pred, average='weighted'),
            'F1-score': f1_score(y_test, y_pred, average='weighted')
        }

# PCA results table: one row per (model, n_components) pair.
pca_results_df = pd.DataFrame(pca_results).T
pca_results_df


In [None]:
# As an example, take Logistic Regression without PCA.
best_model = LogisticRegression(max_iter=500)
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

# Pass the full label set explicitly: otherwise a class missing from both
# y_test and the predictions would shrink the matrix and the tick labels
# taken from le.classes_ would no longer line up with its rows/columns.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_test, y_pred_best, labels=class_ids)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Logistic Regression")
plt.show()
