In [10]:
# === 1️⃣ Bibliotheken importieren ===
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# === 2️⃣ Load the dataset ===
import os

# Location of the preprocessed training tweets, relative to the working
# directory the notebook is executed from.
DATA_PATH = "../Data/tweets_preprocessed_train.parquet"

# Diagnostics before loading: report the working directory and whether the
# relative path resolves from it, so a wrong path is easy to spot.
working_dir = os.getcwd()
file_found = os.path.exists(DATA_PATH)
print("Arbeitsverzeichnis:", working_dir)
print("Datei existiert?", file_found)

df = pd.read_parquet(DATA_PATH)

# Quick overview of what was loaded: shape, column names, first rows.
print("✅ Datensatz geladen!")
print("Größe:", df.shape)
print("Spalten:", list(df.columns))
print(df.head(5))

# === 3️⃣ Load the vocabulary ===
# Reads the previously saved word list; the CSV must provide a "word" column.
TOP_VOCAB_PATH = "../Data/top_1000_words.csv"

# keep_default_na=False / dtype=str: with pandas' default NA handling, tokens
# such as "null", "nan" or "n/a" would be parsed as float NaN instead of text.
# A NaN entry is not a usable string term for the vectorizer vocabulary and
# would also end up as a non-string column label later on, so every cell is
# read verbatim as a string.
TOP_VOCABULARY = pd.read_csv(
    TOP_VOCAB_PATH,
    dtype=str,
    keep_default_na=False,
)["word"].tolist()
print(f"✅ TOP_VOCABULARY geladen ({len(TOP_VOCABULARY)} Wörter)")

# === 4️⃣ Binary bag-of-words with CountVectorizer ===
# The vocabulary is supplied up front, so transform() is called directly
# without a preceding fit().
vectorizer_options = {
    "vocabulary": TOP_VOCABULARY,          # the loaded top-word list
    "lowercase": True,
    "binary": True,                        # 0/1 presence flags instead of counts
    # \w+ (rather than \w\w+) also accepts single-character tokens.
    "token_pattern": r"(?u)\b\w+\b",
}
vectorizer = CountVectorizer(**vectorizer_options)

# Turn every text into a 0/1 vector over the vocabulary.
tweet_texts = df["text"]
X_bin = vectorizer.transform(tweet_texts)
print("✅ Binary Feature-Matrix erstellt!")
print("Matrix-Form:", X_bin.shape)  # e.g. (6090, 1000)

# === 5️⃣ Show the binary feature matrix as a DataFrame ===
# Densify the matrix and label each column with its vocabulary term.
feature_names = vectorizer.get_feature_names_out()
dense_matrix = X_bin.toarray()
binary_df = pd.DataFrame(dense_matrix, columns=feature_names)

print("Vorschau der Binary Feature-Matrix:")
print(binary_df.head(5))

# === 6️⃣ Optional: persist the matrix ===
# Written next to the input data so later steps can reload the features.
SAVE_PATH = "../Data/binary_vectors.parquet"
binary_df.to_parquet(path=SAVE_PATH)
print(f"✅ Binary Feature-Matrix gespeichert unter: {SAVE_PATH}")


Arbeitsverzeichnis: /Users/hikmetacig/grundlagen-des-nlp-ws25_26/Abgabe/Notebooks
Datei existiert? True
✅ Datensatz geladen!
Größe: (6090, 3)
Spalten: ['text', 'label_name', 'label']
                                                text  label_name  \
0  lumber beat rapid game western division final ...  ['sports']   
1         hear eli gold announce auburn game dumbass  ['sports']   
2       phone away try look home game ticket october  ['sports']   
3  year ago louisville struggle beat fcs opponent...  ['sports']   
4  know dodger oriole game thursday fox arguably ...  ['sports']   

                                               label  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
✅ TOP_VOCABULARY geladen (1000 Wörter)
✅ Binary Feature-Matrix erstellt!
Matri