<a href="https://colab.research.google.com/github/lfekmf/Assignment1/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -----------------------
# 0. Settings (adjust these for your own dataset)
# -----------------------
DATA_PATH = '/content/Coffe_sales.csv'   # <-- path to the CSV file inside the Colab runtime
TEXT_COLUMN = 'coffee_name'                     # <-- name of the column holding the text feature
TARGET_COLUMN = 'money'                  # <-- name of the column holding the target label

# -----------------------
# 1. Библиотеки и загрузка
# -----------------------
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from google.colab import files

# Read the raw CSV and show its size plus the first rows.
df = pd.read_csv(DATA_PATH)
print("shape:", df.shape)
display(df.head())

# -----------------------
# 2. Basic text preprocessing
# -----------------------
# Keep only the text and target columns and drop every row where either one
# is missing; the string conversion and cleaning happen afterwards.
df = df.loc[:, [TEXT_COLUMN, TARGET_COLUMN]].dropna().copy()
def clean_text(s):
    """Normalize a raw value into a clean, lower-case text string.

    Steps, in order:
      1. convert the value to ``str`` and lower-case it;
      2. replace every character that is not a digit, a Latin letter, a
         Russian Cyrillic letter, a Kazakh-specific letter or whitespace
         with a single space;
      3. collapse runs of whitespace into one space and trim both ends.

    Args:
        s: Any value; it is passed through ``str()`` first.

    Returns:
        The cleaned string (may be empty).
    """
    s = str(s).lower()
    # Only lower-case letters are listed because the text is already
    # lower-cased. The Kazakh set includes "ң" and "ұ" so that they are not
    # erased from Kazakh words.
    s = re.sub(r'[^0-9a-zа-яёәғқңөұүһі\s]', ' ', s)
    # Collapse whitespace AFTER the substitution above: that step inserts
    # spaces of its own, so collapsing first would leave double spaces behind.
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

df[TEXT_COLUMN] = df[TEXT_COLUMN].map(clean_text)
print("after dropna:", df.shape)

# -----------------------
# 3. Encode the target column as integer class ids
# -----------------------
le = LabelEncoder()
y = le.fit_transform(df[TARGET_COLUMN])
num_targets = len(le.classes_)
print("Unique targets:", num_targets, le.classes_)

# -----------------------
# 4. TF-IDF vectorization
# -----------------------
# Do not pass stop_words='english' unless the text is actually English.
# NOTE(review): the vectorizer is fitted on the full dataset here, i.e. before
# the train/test split that happens later in the notebook.
tfidf = TfidfVectorizer(
    stop_words=None,
    ngram_range=(1, 2),
    max_features=5000,
)
X = tfidf.fit_transform(df[TEXT_COLUMN])
print("TF-IDF shape:", X.shape)
num_features = X.shape[1]

# -----------------------
# 5. Configure SVD for dimensionality reduction (needed for KNN)
# -----------------------
# Use at most 300 components and always fewer than the number of TF-IDF
# features; fall back to a single component when there is only one feature.
if num_features > 1:
    n_components = min(300, num_features - 1)
else:
    n_components = 1
svd = TruncatedSVD(n_components=n_components, random_state=42)
# The SVD is deliberately NOT fitted here on the whole sample: it is fitted on
# the training split inside the experiment loop.
print("SVD components:", n_components)

# -----------------------
# 6. Experiments: 5 iterations with different train/test proportions
# -----------------------
train_sizes = [0.8, 0.75, 0.7, 0.65, 0.6]   # one entry per iteration (can be changed)
random_states = [42, 7, 21, 99, 123]        # seed paired with each train size

results = []


def _evaluate_classifier(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on the training data and score it on the evaluation data.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        X_fit, y_fit: Training features and labels.
        X_eval, y_eval: Evaluation features and labels.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1'.
        Precision, recall and F1 are weighted averages computed with
        ``zero_division=0``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'Accuracy': accuracy_score(y_eval, y_pred),
        'Precision': precision_score(y_eval, y_pred, **weighted),
        'Recall': recall_score(y_eval, y_pred, **weighted),
        'F1': f1_score(y_eval, y_pred, **weighted),
    }


for iter_idx, (train_size, rs) in enumerate(zip(train_sizes, random_states), start=1):
    print(f"\n=== Iteration {iter_idx}: train_size={train_size} (rs={rs}) ===")
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=rs, stratify=y)
    # Fit the SVD on the training split only, then project the test split.
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.transform(X_test)

    # All three models are trained on the same SVD features so that the
    # comparison uses identical inputs (KNN needs the reduced dense features).
    Xtr = X_train_svd
    Xte = X_test_svd

    # Percentages are rounded, not truncated: (1 - 0.8) * 100 evaluates to
    # 19.999999999999996 in floating point, which int() would report as 19.
    # Deriving the test share from the train share keeps the pair summing to 100.
    train_pct = round(train_size * 100)
    base_info = {
        'Iteration': iter_idx,
        'Num_features': num_features,
        'Num_targets': num_targets,
        'Train_size_%': train_pct,
        'Test_size_%': 100 - train_pct
    }

    dt = DecisionTreeClassifier(random_state=rs)
    rf = RandomForestClassifier(n_estimators=100, random_state=rs, n_jobs=-1)
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

    # One result row per model, in the order: Decision Tree, Random Forest, KNN.
    for algorithm, model in (('Decision Tree', dt), ('Random Forest', rf), ('KNN', knn)):
        metrics = _evaluate_classifier(model, Xtr, y_train, Xte, y_test)
        results.append({'Algorithm': algorithm, **base_info, **metrics})

# Collect all per-model rows into the final summary table and show it.
table1_df = pd.DataFrame(results)
display(table1_df)

# Write the table to CSV (without the index) and trigger a browser download.
results_csv = 'table1_results.csv'
table1_df.to_csv(results_csv, index=False)
files.download(results_csv)

shape: (3547, 11)


Unnamed: 0,hour_of_day,cash_type,money,coffee_name,Time_of_Day,Weekday,Month_name,Weekdaysort,Monthsort,Date,Time
0,10,card,38.7,Latte,Morning,Fri,Mar,5,3,2024-03-01,10:15:50.520000
1,12,card,38.7,Hot Chocolate,Afternoon,Fri,Mar,5,3,2024-03-01,12:19:22.539000
2,12,card,38.7,Hot Chocolate,Afternoon,Fri,Mar,5,3,2024-03-01,12:20:18.089000
3,13,card,28.9,Americano,Afternoon,Fri,Mar,5,3,2024-03-01,13:46:33.006000
4,13,card,38.7,Latte,Afternoon,Fri,Mar,5,3,2024-03-01,13:48:14.626000


after dropna: (3547, 2)
Unique targets: 13 [18.12 21.06 23.02 24.   25.96 27.92 28.9  30.86 32.82 33.8  35.76 37.72
 38.7 ]
TF-IDF shape: (3547, 13)
SVD components: 12

=== Iteration 1: train_size=0.8 (rs=42) ===

=== Iteration 2: train_size=0.75 (rs=7) ===

=== Iteration 3: train_size=0.7 (rs=21) ===

=== Iteration 4: train_size=0.65 (rs=99) ===

=== Iteration 5: train_size=0.6 (rs=123) ===


Unnamed: 0,Algorithm,Iteration,Num_features,Num_targets,Train_size_%,Test_size_%,Accuracy,Precision,Recall,F1
0,Decision Tree,1,13,13,80,19,0.542254,0.29498,0.542254,0.381834
1,Random Forest,1,13,13,80,19,0.542254,0.29498,0.542254,0.381834
2,KNN,1,13,13,80,19,0.477465,0.287406,0.477465,0.354838
3,Decision Tree,2,13,13,75,25,0.542277,0.296034,0.542277,0.382442
4,Random Forest,2,13,13,75,25,0.542277,0.296034,0.542277,0.382442
5,KNN,2,13,13,75,25,0.525366,0.315014,0.525366,0.390087
6,Decision Tree,3,13,13,70,30,0.542723,0.296014,0.542723,0.382668
7,Random Forest,3,13,13,70,30,0.542723,0.296014,0.542723,0.382668
8,KNN,3,13,13,70,30,0.369953,0.347352,0.369953,0.312691
9,Decision Tree,4,13,13,65,35,0.541868,0.295094,0.541868,0.381677


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): DATA_PATH in the first cell points at /content/..., not at
# /content/drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')