In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from nltk.stem import PorterStemmer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
file_path = '/content/drive/My Drive/data.csv'

In [58]:
def stem_text(text):
    ps = PorterStemmer()
    return ' '.join([ps.stem(word) for word in text.split()])

In [59]:
def load_data(filepath):
    try:
        df = pd.read_csv(filepath, dtype="string")
        df = df.dropna()
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

In [73]:
def preprocess_text(corpus):
    ps = PorterStemmer()
    processed_corpus = corpus.fillna("").str.lower().str.replace(r'[^\w\s]', '', regex=True)
    stemmed_corpus = processed_corpus.apply(lambda text: ' '.join([ps.stem(word) for word in text.split()]))
    return stemmed_corpus

In [61]:
def vectorize_text(corpus):
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',
                                 token_pattern=r'\b\w+\b', max_df=0.25,
                                 tokenizer=stem_text)
    return vectorizer.fit_transform(corpus), vectorizer


In [62]:
def train_svm(X_train, y_train):
    model = svm.SVC(kernel='linear', random_state=0, C=1.0)
    model.fit(X_train, y_train)
    return model

In [63]:
def plot_decision_boundary(X, y, model):
    color = ['black' if c == 0 else 'lightgrey' for c in y]
    plt.scatter(X[:, 0], X[:, 1], c=color, alpha=0.5)

    w = model.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(X[:, 0].min(), X[:, 0].max())
    yy = a * xx - (model.intercept_[0]) / w[1]

    plt.plot(xx, yy, 'r-')
    plt.title("Decision Boundary")
    plt.show()


In [79]:
def main():
    filepath ='/content/drive/My Drive/data.csv'
    try:
        df = pd.read_csv(filepath, dtype="string")
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    df["Disinfo_cases_en"] = preprocess_text(df["Disinfo_cases_en"])

    df["Fakes"] = df["Fakes"].astype("category").cat.codes

    X = df["Disinfo_cases_en"]
    y = df["Fakes"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.25)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    model = SVC(kernel="linear", random_state=42, C=1.0)
    model.fit(X_train_tfidf, y_train)

    y_pred = model.predict(X_test_tfidf)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(df["Fakes"].value_counts())


In [80]:
if __name__ == "__main__":
    main()

Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         3
           0       0.00      0.00      0.00         1
           1       0.15      1.00      0.27        11
           3       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          20       1.00      1.00      1.00         1
          21       0.00      0.00      0.00         3
          22       0.00      0.00      0.00         4
          23       0.00      0.00      0.00         3
          24       0.00      0.00      0.00         1
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Oversampling fejków

In [97]:
filepath ='/content/drive/My Drive/data.csv'
df = pd.read_csv(filepath, dtype="string")

print("Rozkład klas przed balansowaniem:")
print(df["Fakes"].value_counts())

# Opcjonalne grupowanie rzadkich klas
threshold = 5
df["Fakes"] = df["Fakes"].apply(lambda x: x if df["Fakes"].value_counts()[x] > threshold else -1)

# Zbalansowanie danych (przykład: SMOTE)
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Trenowanie modelu na zbalansowanych danych
model.fit(X_resampled, y_resampled)

# Ocena modelu
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


Rozkład klas przed balansowaniem:
Fakes
Conflict in Eastern Ukraine is solely internal, i.e, a civil war           71
Russia did not occupy Crimea. It was a reunification                       43
Russia is not involved in MH17 crash; Ukraine shot down the plane          28
The West is fueling the war in Ukraine                                     24
The West uses Ukraine in its actions against Russia                        22
                                                                           ..
Ukrainian gas pipeline is worn out                                          1
Donbas separated because nationalists came to power in Kyiv                 1
NATO is making provocations and advancing to Russia's borders               1
Ukrainian authorities are preparing the next provocation against Russia     1
The Ukrainian Church supports war and nationalist rhetoric                  1
Name: count, Length: 72, dtype: Int64


KeyError: <NA>

In [98]:
print(df["Fakes"].isna().sum())

12


In [100]:
df = df.dropna(subset=["Fakes"])
df["Fakes"] = df["Fakes"].fillna(-1)
df["Disinfo_cases_en"] = df["Disinfo_cases_en"].fillna("")
if df["Fakes"].isna().any():
    print("Brakujące wartości w kolumnie Fakes")
    df["Fakes"] = df["Fakes"].fillna(-1)

In [105]:
def main():
    filepath = '/content/drive/My Drive/data.csv'
    try:
        df = pd.read_csv(filepath, dtype="string")
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    # Wypełnij brakujące wartości
    df["Fakes"] = df["Fakes"].fillna("N/A")  # Użycie wartości tekstowej
    df["Disinfo_cases_en"] = df["Disinfo_cases_en"].fillna("")

    # Preprocessing
    processed_text = preprocess_text(df["Disinfo_cases_en"])
    df["Disinfo_cases_en"] = processed_text

    X = df["Disinfo_cases_en"]
    y = df["Fakes"]

    # Podział na dane treningowe i testowe
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Wektoryzacja
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.25, max_features=1000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Trenowanie modelu
    from sklearn.svm import LinearSVC  # Szybszy niż SVC
    model = LinearSVC(random_state=42, C=1.0)
    model.fit(X_train_tfidf, y_train)

    # Predykcja i ewaluacja
    y_pred = model.predict(X_test_tfidf)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(df["Fakes"].value_counts())

if __name__ == "__main__":
    main()


Classification Report:
                                                                                                                                                                                precision    recall  f1-score   support

                                                                                                                              Anti-Russian neo-Nazism is florishing in Ukraine       0.00      0.00      0.00         1
                                                                                                              Conflict in Eastern Ukraine is solely internal, i.e, a civil war       0.46      0.55      0.50        11
                                                                                                                               Delays in the Nord Stream 2 cause crisis in EU        1.00      1.00      1.00         3
                                                                                                                

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
