# Wstęp do Uczenia Maszynowego - Projekt 2
## Etap: Drugi Kamień Milowy 
### Autorzy: Krzysztof Osiński, Jakub Miszczak

In [None]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import warnings
warnings.filterwarnings('ignore')
np.random.seed(23)
import zipfile

## Wczytanie zbioru danych

In [None]:
# Location of the zipped dataset, relative to the working directory.
zip_path = "ecommerceDataset.csv.zip"

# Stream the CSV member straight out of the archive. header=None makes
# pandas treat the first row as data rather than as column names.
with zipfile.ZipFile(zip_path, mode='r') as archive, \
        archive.open("ecommerceDataset.csv") as csv_member:
    df = pd.read_csv(csv_member, header=None)

# Name the two columns explicitly since the file supplies no header.
df.columns = ['category', 'description']

## Podstawowy stemming i lematyzacja

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time NLTK resource downloads, left disabled. On a fresh environment
# the stop-word corpus and WordNet data presumably have to be fetched
# before the objects below can be used -- uncomment as needed.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# English stop words held in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))
# NOTE(review): preprocess_text in this section lemmatizes only; this
# stemmer is not referenced by it -- confirm whether it is used elsewhere.
stemmer = PorterStemmer()
# Shared lemmatizer instance used by preprocess_text.
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one description into a space-joined string of lemmas.

    The text is lower-cased and split on whitespace (used here instead of
    ``word_tokenize``). Tokens that are not purely alphabetic or that appear
    in ``stop_words`` are dropped; the remaining tokens are passed through
    ``lemmatizer.lemmatize``.

    Args:
        text: Raw description. Non-string values (for example the NaN that
            pandas produces for an empty CSV field) are treated as empty
            text instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when no token
        survives the filters.
    """
    if not isinstance(text, str):
        return ''
    # NOTE: because the split is on whitespace only, a token with attached
    # punctuation (e.g. "phone,") fails isalpha() and is dropped entirely.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(tokens)

df['clean_text'] = df['description'].apply(preprocess_text)

## TF-IDF na oczyszczonych tekstach

In [None]:
# TfidfVectorizer is imported here because the file-level imports only bring
# in the bare `sklearn` package, which does not expose the class by name.
from sklearn.feature_extraction.text import TfidfVectorizer

# Uni- and bi-gram TF-IDF limited to the 1000 highest-ranked features.
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), stop_words='english')
X_tfidf = tfidf.fit_transform(df['clean_text'])


# Most important features by total TF-IDF weight across all documents.
# The column sums are taken on the sparse matrix directly and computed once,
# instead of densifying the whole matrix with toarray() twice.
feature_array = tfidf.get_feature_names_out()
tfidf_totals = np.asarray(X_tfidf.sum(axis=0)).ravel()
tfidf_sorting = tfidf_totals.argsort()[::-1]
top_n = 20
top_features = feature_array[tfidf_sorting][:top_n]

# Bar chart of the top_n features and their summed weights.
plt.figure(figsize=(10,5))
sns.barplot(x=top_features, y=tfidf_totals[tfidf_sorting][:top_n])
plt.xticks(rotation=45)
plt.title('Top cechy wg TF-IDF')
plt.show()

## t-SNE

In [None]:
# from sklearn.manifold import TSNE
#
# tsne = TSNE(n_components=2, perplexity=30, n_iter=1000, random_state=42)
# X_tsne = tsne.fit_transform(X_tfidf.toarray())
#
# plt.figure(figsize=(10, 6))
# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.5, s=10)
# plt.title("Redukcja wymiarów za pomocą t-SNE")
# plt.xlabel("Wymiar 1")
# plt.ylabel("Wymiar 2")
# plt.show()

## Word2Vec

In [None]:
from gensim.models import Word2Vec

# Tokenisation: split every cleaned description on whitespace.
tokenized = df['clean_text'].map(str.split)

# Initial Word2Vec training on the tokenised descriptions.
w2v_model = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

def get_mean_embedding(doc):
    """Return the mean Word2Vec vector of the tokens in *doc*.

    Args:
        doc: Iterable of string tokens.

    Returns:
        The element-wise mean of the vectors of those tokens that are
        present in ``w2v_model.wv``. When none of the tokens is present
        (including an empty *doc*), a zero vector with the model's
        dimensionality is returned instead.
    """
    words = [word for word in doc if word in w2v_model.wv]
    if not words:
        # Size the fallback from the model itself rather than a literal 100,
        # so it stays correct if the training vector_size is changed.
        return np.zeros(w2v_model.wv.vector_size)
    return np.mean(w2v_model.wv[words], axis=0)

# One mean-embedding row per document, stacked into a 2-D array.
X_w2v = np.array(list(map(get_mean_embedding, tokenized)))

# Quick sanity check: overall shape and the first five rows.
print(X_w2v.shape)
print(X_w2v[:5])