In [20]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

from textblob import TextBlob
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter




In [21]:
# load training data
train = pd.read_csv('../data/train2.tsv', sep='\t', header=None)
test = pd.read_csv('../data/test2.tsv', sep='\t', header=None)
column_names = [
    "Index",
    "ID",
    "Label",
    "Statement",
    "Subject",
    "Speaker",
    "Speaker_Job_Title",
    "State_Info",
    "Party_Affiliation",
    "Barely_True_Counts",
    "False_Counts",
    "Half_True_Counts",
    "Mostly_True_Counts",
    "Pants_On_Fire_Counts",
    "Context",
    "Extracted_Justification"
]

train.columns = column_names
test.columns = column_names

In [30]:
def data_clean(df):

    # Fill missing values in text-based columns
    df.replace({'Statement':''}, np.nan, inplace=True)
    df.replace({'Extracted_Justification':''}, np.nan, inplace=True)

    # Drop rows with missing 'Statement' or 'Extracted_Justification' (since these are critical)
    df.dropna(subset=['Statement', 'Extracted_Justification'], inplace=True)

    # Drop rows with missing labels
    df = df.dropna(subset=['Label'])

    # Impute missing values in categorical columns with 'Unknown'
    categorical_columns = ['Speaker', 'Speaker_Job_Title', 'State_Info', 'Party_Affiliation', 'Context']
    df[categorical_columns] = df[categorical_columns].fillna('Unknown')

    # Impute numerical columns (truth counts) with median values
    numeric_columns = ["Barely_True_Counts", "False_Counts", "Half_True_Counts", "Mostly_True_Counts", "Pants_On_Fire_Counts"]
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())
    return df

# Check for missing values
print(train.isnull().sum())

train = data_clean(train)
test = data_clean(test)

print(train.isnull().sum())

NameError: name 'data' is not defined

In [23]:
# Label Encoding for categorical variables
label_encoder = LabelEncoder()
train['Label'] = label_encoder.fit_transform(train['Label'])  # Encoding labels like "false", "half-true", etc. to 0, 1...
test['Label'] = label_encoder.fit_transform(test['Label'])  # Encoding labels like "false", "half-true", etc. to 0, 1...

train['Text'] = train['Statement'] + ' ' + train['Extracted_Justification']
test['Text'] = test['Statement'] + ' ' + test['Extracted_Justification']

X_train = train[['Text']]
y_train = train['Label']
X_test = test[['Text']]
y_test = test['Label']

## Content Statistics

In [27]:
#!python -m spacy download en_core_web_sm

In [28]:
# Load a pre-trained NLP model
nlp = spacy.load("en_core_web_sm")

In [29]:
def structural_analysis(statement):
    doc = nlp(statement)
    num_sentences = len(list(doc.sents))
    total_tokens = len([token.text for token in doc])
    
    # Syntactic complexity
    avg_sentence_length = total_tokens / num_sentences if num_sentences > 0 else 0
    tree_depth = max([token.head.i - token.i for token in doc]) if len(doc) > 0 else 0
    
    # Sentiment Analysis
    sentiment = TextBlob(statement).sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity

    return [avg_sentence_length, tree_depth, polarity, subjectivity]

def extract_graph_features(statement):
    doc = nlp(statement)
    pos_counts = Counter([token.pos_ for token in doc])
    entities = Counter([ent.label_ for ent in doc.ents])
    
    # part of speech tagging
    pos_noun = pos_counts.get("NOUN", 0)
    pos_verb = pos_counts.get("VERB", 0)
    pos_adjective = pos_counts.get("ADJ", 0)
    
    # named entity recognition
    num_persons = entities.get("PERSON", 0)
    num_orgs = entities.get("ORG", 0)
    num_gpes = entities.get("GPE", 0)
    
    return [pos_noun, pos_verb, pos_adjective, num_persons, num_orgs, num_gpes]

def extract_comparison_features(statement):
    # Keywords for different LIWC-like categories (simplified)
    cognitive_words = ["think", "know", "understand", "believe"]
    emotional_words = ["happy", "sad", "angry", "fear"]
    social_words = ["friend", "family", "society"]

    # Tokenize statement and count keywords
    vectorizer = CountVectorizer(vocabulary=cognitive_words + emotional_words + social_words)
    word_counts = vectorizer.fit_transform([statement]).toarray().flatten()

    # Divide word counts into different categories
    num_cognitive = sum(word_counts[:len(cognitive_words)])
    num_emotional = sum(word_counts[len(cognitive_words):len(cognitive_words) + len(emotional_words)])
    num_social = sum(word_counts[-len(social_words):])

    return [num_cognitive, num_emotional, num_social]
    
def extract_feature(statement):
    return structural_analysis(statement) + extract_graph_features(statement) + extract_comparison_features(statement)
    

In [8]:
X_train_feature = X_train['Text'].apply(lambda x: extract_feature(x))

In [9]:
X_test_feature = X_test['Text'].apply(lambda x: extract_feature(x))

In [10]:
# Initialize and train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_feature.to_list(), y_train)

# Predictions
y_pred = model.predict(X_test_feature.to_list())

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.20588235294117646
              precision    recall  f1-score   support

           0       0.24      0.16      0.19       210
           1       0.18      0.23      0.20       249
           2       0.22      0.30      0.25       263
           3       0.20      0.25      0.22       240
           4       0.29      0.02      0.04        90
           5       0.19      0.13      0.15       206

    accuracy                           0.21      1258
   macro avg       0.22      0.18      0.18      1258
weighted avg       0.21      0.21      0.20      1258



## Corpus Structure