In [None]:
from utils import *

# Modelling with Dataset 1
## Reading the Dataset 1

In [None]:
# Load the cleaned "true" and "fake" article tables of Dataset 1.
# The first CSV column is used as the row index of each frame.
true_data = pd.read_csv(
    "../../Data/dataset_1/clean_data/true_clean_data.csv",
    index_col=0,
)
fake_data = pd.read_csv(
    "../../Data/dataset_1/clean_data/fake_clean_data.csv",
    index_col=0,
)

# Preview the first rows of the "true" table.
true_data.head()

## Train-Test-Split

In [None]:
# Pull the preprocessed text (features) and the class label (target)
# out of each frame as plain arrays.
true_X, true_y = (
    true_data["text_preprocessed"].values,
    true_data["class_label"].values,
)

fake_X, fake_y = (
    fake_data["text_preprocessed"].values,
    fake_data["class_label"].values,
)

In [None]:
# Step 1: hold out 20% of each class as the test subset.
# The "true" and "fake" articles are split separately with the same seed.
true_X_train, true_X_test, true_y_train, true_y_test = train_test_split(
    true_X, true_y,
    test_size=0.2,
    random_state=99,
)
fake_X_train, fake_X_test, fake_y_train, fake_y_test = train_test_split(
    fake_X, fake_y,
    test_size=0.2,
    random_state=99,
)

# Step 2: carve 20% of the remaining training data off as the validation subset
# (overall proportions: 64% train / 16% validation / 20% test).
true_X_train, true_X_val, true_y_train, true_y_val = train_test_split(
    true_X_train, true_y_train,
    test_size=0.2,
    random_state=99,
)
fake_X_train, fake_X_val, fake_y_train, fake_y_val = train_test_split(
    fake_X_train, fake_y_train,
    test_size=0.2,
    random_state=99,
)

In [None]:
# Stack the "true" rows followed by the "fake" rows for every subset.
# Text and labels use the same order, so they stay aligned.

# Text
X_train_text = np.concatenate([true_X_train, fake_X_train])
X_val_text = np.concatenate([true_X_val, fake_X_val])
X_test_text = np.concatenate([true_X_test, fake_X_test])

# Labels
y_train = np.concatenate([true_y_train, fake_y_train])
y_val = np.concatenate([true_y_val, fake_y_val])
y_test = np.concatenate([true_y_test, fake_y_test])

## Using CountVectorizer with Bag of Words, Unigrams

In [None]:
# Bag-of-words counts over single words, English stop words removed.
# The vocabulary is learned from the training text only.
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english').fit(X_train_text)

# Encode every subset with the vocabulary learned above.
X_train, X_val, X_test = (
    vectorizer.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
)

In [None]:
# Initialise the linear SVM classifier.
# random_state is pinned so that repeated runs of the notebook give the same
# fitted model (the solver shuffles data when solving the dual problem).
svm_clf = LinearSVC(random_state=1)

# Train the model on the unigram count features.
svm_clf.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the validation subset.
y_val_pred = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

In [None]:
# Score the fitted classifier on the held-out test subset.
y_test_pred = svm_clf.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

## Using CountVectorizer with Bag of Words, Unigrams + Bigrams

In [None]:
# Bag-of-words counts over single words and word pairs, English stop words removed.
# The vocabulary is learned from the training text only.
vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english').fit(X_train_text)

# Encode every subset with the vocabulary learned above.
X_train, X_val, X_test = (
    vectorizer.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
)

In [None]:
# Re-fit the existing svm_clf estimator on the current X_train features.
svm_clf.fit(X=X_train, y=y_train)

In [None]:
# Validation-set report for the model fitted on the current features.
y_val_pred2 = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred2))

In [None]:
# Test-set report for the model fitted on the current features.
y_test_pred2 = svm_clf.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred2))

## Using CountVectorizer with Bag of Words, Bigrams

In [None]:
# Bag-of-words counts over word pairs only, English stop words removed.
# The vocabulary is learned from the training text only.
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(X_train_text)

# Encode every subset with the vocabulary learned above.
X_train, X_val, X_test = (
    vectorizer.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
)

In [None]:
# Re-fit the existing svm_clf estimator on the current X_train features.
svm_clf.fit(X=X_train, y=y_train)

In [None]:
# Validation-set report for the model fitted on the current features.
y_val_pred3 = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred3))

In [None]:
# Test-set report for the model fitted on the current features.
y_test_pred3 = svm_clf.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred3))

## Using Tf-Idf with Unigrams, Unigrams + Bigrams, and Bigrams

In [None]:
# n-gram settings to compare: display label -> ngram_range given to TfidfVectorizer.
tfidf_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}

for ngram, values in tfidf_params.items():
    # Learn the TF-IDF vocabulary and weights from the training text only.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values)
    tfidf_vectorizer.fit(X_train_text)

    # Encode every subset with the fitted vectorizer.
    X_train = tfidf_vectorizer.transform(X_train_text)
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    # A fresh classifier per setting. random_state is pinned so the comparison
    # between n-gram settings is reproducible from run to run.
    svm_clf = LinearSVC(random_state=1)
    print(f"Model with {ngram}")
    svm_clf.fit(X_train, y_train)

    # Validation Data
    print("Testing using validation data:")
    y_val_pred = svm_clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = svm_clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

# Modelling with Combined Datasets (Dataset 1 & Dataset 2)
## Reading the Combined Dataset

In [None]:
# Load the pre-split combined dataset: one CSV per subset.
train_data = pd.read_csv(
    "../../Data/Combined data/train_data.csv"
)
val_data = pd.read_csv(
    "../../Data/Combined data/validation_data.csv"
)
test_data = pd.read_csv(
    "../../Data/Combined data/test_data.csv"
)

# Show five randomly chosen training rows.
train_data.sample(5)

In [None]:
# For each subset, take the preprocessed text as features and the class label as target.
X_train_text, y_train = (
    train_data["text_preprocessed"].values,
    train_data["class_label"].values,
)

X_val_text, y_val = (
    val_data["text_preprocessed"].values,
    val_data["class_label"].values,
)

X_test_text, y_test = (
    test_data["text_preprocessed"].values,
    test_data["class_label"].values,
)

## Using CountVectorizer with Bag of Words, Unigrams

In [None]:
# Bag-of-words counts over single words, English stop words removed.
# The vocabulary is learned from the training text only.
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english').fit(X_train_text)

# Encode every subset with the vocabulary learned above.
X_train, X_val, X_test = (
    vectorizer.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
)

In [None]:
# Report the vocabulary size (one feature per vocabulary term).
# `vocabulary_` is read directly because `get_feature_names()` was removed in
# scikit-learn 1.2; the fitted vocabulary mapping exists in every version.
print("number of features used:", len(vectorizer.vocabulary_))

In [None]:
# Print the sparse document-term matrix produced by the vectorizer:
# each stored entry is the count of one vocabulary term in one training article.
print(X_train)

In [None]:
# Train a fresh linear SVM on the unigram count features.
# random_state is pinned so that repeated runs give the same fitted model.
svm_clf = LinearSVC(random_state=1)
svm_clf.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the validation subset.
y_val_pred = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

In [None]:
# Score the fitted classifier on the held-out test subset.
y_test_pred = svm_clf.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

## Using CountVectorizer with Bag of Words, Unigrams + Bigrams

In [None]:
# Bag-of-words counts over single words and word pairs, English stop words removed.
# The vocabulary is learned from the training text only.
vectorizer5 = CountVectorizer(ngram_range=(1, 2), stop_words='english').fit(X_train_text)

# Encode every subset with the vocabulary learned above.
X_train, X_val, X_test = (
    vectorizer5.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
)

In [None]:
# Re-fit the existing svm_clf estimator on the current X_train features.
svm_clf.fit(X=X_train, y=y_train)

In [None]:
# Validation-set report for the model fitted on the current features.
y_val_pred2 = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred2))

In [None]:
# Test-set report for the model fitted on the current features.
y_test_pred2 = svm_clf.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred2))

## Using CountVectorizer with Bag of Words, Bigrams only

In [None]:
# Bag-of-words counts over word pairs only, English stop words removed.
# The vocabulary is learned from the training text only.
vectorizer3 = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(X_train_text)

# Encode every subset with the vocabulary learned above.
X_train, X_val, X_test = (
    vectorizer3.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
)

In [None]:
# Re-fit the existing svm_clf estimator on the current X_train features.
svm_clf.fit(X=X_train, y=y_train)

In [None]:
# Validation-set report for the model fitted on the current features.
y_val_pred3 = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred3))

In [None]:
# Test-set report for the model fitted on the current features.
y_test_pred3 = svm_clf.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred3))

## Using Tf-Idf

In [None]:
# n-gram settings to compare: display label -> ngram_range given to TfidfVectorizer.
tfidf_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}

for ngram, values in tfidf_params.items():
    # Learn the TF-IDF vocabulary and weights from the training text only.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values)
    tfidf_vectorizer.fit(X_train_text)

    # Encode every subset with the fitted vectorizer.
    X_train = tfidf_vectorizer.transform(X_train_text)
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    # A fresh classifier per setting. random_state is pinned so the comparison
    # between n-gram settings is reproducible from run to run.
    svm_clf = LinearSVC(random_state=1)
    print(f"Model with {ngram}")
    svm_clf.fit(X_train, y_train)

    # Validation Data
    print("Testing using validation data:")
    y_val_pred = svm_clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = svm_clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

# Modelling with Feature Selection & Added Features (Combined Data)
## Reading the Combined Data

In [None]:
# Load the pre-split dataset that includes the added feature columns: one CSV per subset.
train_data = pd.read_csv(
    "../../Data/Data with added features/Final datasets/train_data.csv"
)
val_data = pd.read_csv(
    "../../Data/Data with added features/Final datasets/val_data.csv"
)
test_data = pd.read_csv(
    "../../Data/Data with added features/Final datasets/test_data.csv"
)

# Show five randomly chosen training rows.
train_data.sample(5)

In [None]:
# For each subset, take the preprocessed text as features and the class label as target.
X_train_text, y_train = (
    train_data["text_preprocessed"].values,
    train_data["class_label"].values,
)

X_val_text, y_val = (
    val_data["text_preprocessed"].values,
    val_data["class_label"].values,
)

X_test_text, y_test = (
    test_data["text_preprocessed"].values,
    test_data["class_label"].values,
)

## Using Tf-Idf, Unigrams + Bigrams
### Feature Selection, 134 Features (min_df = 0.15)

In [None]:
print('--------------------TF-IDF--------------------')

# TF-IDF over unigrams + bigrams; min_df=0.15 drops terms that occur in
# fewer than 15% of the training documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.15)
tfidf_vectorizer.fit(X_train_text)

# Encode every subset with the vectorizer fitted on the training text.
X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

print('TF-IDF Model with min_df=0.15')
svm_clf.fit(X_train, y_train)
# Number of retained features = size of the fitted vocabulary.
# `vocabulary_` is used because `get_feature_names()` was removed in scikit-learn 1.2.
num_features = len(tfidf_vectorizer.vocabulary_)
print(num_features)

#Validation Data
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

# Test Data
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")

### Feature Selection, 3k Features (min_df = 0.01)

In [None]:
print('--------------------TF-IDF--------------------')

# TF-IDF over unigrams + bigrams; min_df=0.01 drops terms that occur in
# fewer than 1% of the training documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.01)
tfidf_vectorizer.fit(X_train_text)

# Encode every subset with the vectorizer fitted on the training text.
X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

print('TF-IDF Model with min_df=0.01')
svm_clf.fit(X_train, y_train)
# Number of retained features = size of the fitted vocabulary.
# `vocabulary_` is used because `get_feature_names()` was removed in scikit-learn 1.2.
num_features = len(tfidf_vectorizer.vocabulary_)
print(num_features)

#Validation Data
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

# Test Data
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")

### All Added Features, 134 Features (min_df = 0.15)

In [None]:
# TF-IDF over unigrams + bigrams with min_df = 0.15, as chosen during feature selection.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# The mapper places the added numeric columns (given no transformer: None)
# alongside the TF-IDF vectors of the text column.
mapper = DataFrameMapper([
    (
        [
            'char_count', 'word_count', 'sentence_count', 'prop_unique_words',
            'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
            'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
            'prop_discourse_relations', 'textblob_sentiment',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then reuse it on the test frame.
X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)

# Re-fit the classifier on the combined features and report test-set metrics.
svm_clf.fit(X=X_train_added_features, y=y_train)
y_pred = svm_clf.predict(X=X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))

### All Added Features, 3k Features (min_df = 0.01)

In [None]:
# TF-IDF over unigrams + bigrams with min_df = 0.01, as chosen during feature selection.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# The mapper places the added numeric columns (given no transformer: None)
# alongside the TF-IDF vectors of the text column.
mapper = DataFrameMapper([
    (
        [
            'char_count', 'word_count', 'sentence_count', 'prop_unique_words',
            'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
            'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
            'prop_discourse_relations', 'textblob_sentiment',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then reuse it on the test frame.
X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)

# Re-fit the classifier on the combined features and report test-set metrics.
svm_clf.fit(X=X_train_added_features, y=y_train)
y_pred = svm_clf.predict(X=X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))

### Selected Added Features, 134 Features (min_df = 0.15)

In [None]:
# TF-IDF over unigrams + bigrams with min_df = 0.15.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# The mapper places the selected numeric columns (given no transformer: None)
# alongside the TF-IDF vectors of the text column.
mapper = DataFrameMapper([
    (
        [
            'char_count', 'word_count', 'prop_unique_words',
            'avg_sentence_length', 'prop_punctuations',
            'prop_stopwords', 'prop_nouns',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then reuse it on the test frame.
X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)

# Re-fit the classifier on the combined features and report test-set metrics.
svm_clf.fit(X=X_train_added_features, y=y_train)
y_pred = svm_clf.predict(X=X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))

### Selected Added Features, 3k Features (min_df = 0.01)

In [None]:
# TF-IDF over unigrams + bigrams with min_df = 0.01.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# The mapper places the selected numeric columns (given no transformer: None)
# alongside the TF-IDF vectors of the text column.
mapper = DataFrameMapper([
    (
        [
            'char_count', 'word_count', 'prop_unique_words',
            'avg_sentence_length', 'prop_punctuations',
            'prop_stopwords', 'prop_nouns',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then reuse it on the test frame.
X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)

# Re-fit the classifier on the combined features and report test-set metrics.
svm_clf.fit(X=X_train_added_features, y=y_train)
y_pred = svm_clf.predict(X=X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Fine-Tuning the Model for Feature Selection (Unigram + Bigram)

In [None]:
# Candidate values for the LinearSVC parameter C, from 1e-3 to 1e3 in powers of ten.
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Grid search over C for a seeded LinearSVC with a very high iteration cap.
# From here on, svm_clf is the GridSearchCV wrapper rather than a bare LinearSVC.
svm_clf = GridSearchCV(
    estimator=LinearSVC(random_state=1, max_iter=1000000),
    param_grid=params,
)

## Using Tf-Idf 
### Feature selection, 134 Features (min_df = 0.15)

In [None]:
print('--------------------TF-IDF--------------------')

# TF-IDF over unigrams + bigrams; min_df=0.15 drops terms that occur in
# fewer than 15% of the training documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.15)
tfidf_vectorizer.fit(X_train_text)

# Encode every subset with the vectorizer fitted on the training text.
X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

print('TF-IDF Model with min_df=0.15')
# Runs the grid search on the training features.
svm_clf.fit(X_train, y_train)
# Number of retained features = size of the fitted vocabulary.
# `vocabulary_` is used because `get_feature_names()` was removed in scikit-learn 1.2.
num_features = len(tfidf_vectorizer.vocabulary_)
print(num_features)

#Validation Data
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

print('Best Parameters : ',svm_clf.best_params_)

# Test Data
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")

print('Best Parameters : ',svm_clf.best_params_)

### Feature selection, 3k Features (min_df = 0.01)

In [None]:
print('--------------------TF-IDF--------------------')

# TF-IDF over unigrams + bigrams; min_df=0.01 drops terms that occur in
# fewer than 1% of the training documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.01)
tfidf_vectorizer.fit(X_train_text)

# Encode every subset with the vectorizer fitted on the training text.
X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

print('TF-IDF Model with min_df=0.01')
# Runs the grid search on the training features.
svm_clf.fit(X_train, y_train)
# Number of retained features = size of the fitted vocabulary.
# `vocabulary_` is used because `get_feature_names()` was removed in scikit-learn 1.2.
num_features = len(tfidf_vectorizer.vocabulary_)
print(num_features)

#Validation Data
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

print('Best Parameters : ',svm_clf.best_params_)

# Test Data
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")

print('Best Parameters : ',svm_clf.best_params_)

### All Added Features, 134 Features (min_df = 0.15)

In [None]:
# TF-IDF over unigrams + bigrams with min_df = 0.15, as chosen during feature selection.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# The mapper places the added numeric columns (given no transformer: None)
# alongside the TF-IDF vectors of the text column.
mapper = DataFrameMapper([
    (
        [
            'char_count', 'word_count', 'sentence_count', 'prop_unique_words',
            'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
            'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
            'prop_discourse_relations', 'textblob_sentiment',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then reuse it on the test frame.
X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)

# Fit svm_clf on the combined features and report test-set metrics.
svm_clf.fit(X=X_train_added_features, y=y_train)
y_pred = svm_clf.predict(X=X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))
for _ in range(2):
    print("------------------------------------------")

# Parameter setting reported by the fitted search object.
print('Best Parameters : ', svm_clf.best_params_)

### All Added Features, 3k Features (min_df = 0.01)

In [None]:
# TF-IDF over unigrams + bigrams with min_df = 0.01, as chosen during feature selection.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# The mapper places the added numeric columns (given no transformer: None)
# alongside the TF-IDF vectors of the text column.
mapper = DataFrameMapper([
    (
        [
            'char_count', 'word_count', 'sentence_count', 'prop_unique_words',
            'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
            'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
            'prop_discourse_relations', 'textblob_sentiment',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then reuse it on the test frame.
X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)

# Fit svm_clf on the combined features and report test-set metrics.
svm_clf.fit(X=X_train_added_features, y=y_train)
y_pred = svm_clf.predict(X=X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))
for _ in range(2):
    print("------------------------------------------")

# Parameter setting reported by the fitted search object.
print('Best Parameters : ', svm_clf.best_params_)

### Selected Added Features, 134 Features (min_df = 0.15)

In [None]:
# TF-IDF over unigrams + bigrams with min_df = 0.15.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# The mapper places the selected numeric columns (given no transformer: None)
# alongside the TF-IDF vectors of the text column.
mapper = DataFrameMapper([
    (
        [
            'char_count', 'word_count', 'prop_unique_words',
            'avg_sentence_length', 'prop_punctuations',
            'prop_stopwords', 'prop_nouns',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then reuse it on the test frame.
X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)

# Fit svm_clf on the combined features and report test-set metrics.
svm_clf.fit(X=X_train_added_features, y=y_train)
y_pred = svm_clf.predict(X=X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))
for _ in range(2):
    print("------------------------------------------")

# Parameter setting reported by the fitted search object.
print('Best Parameters : ', svm_clf.best_params_)

### Selected Added Features, 3k Features (min_df = 0.01)

In [None]:
# TF-IDF over unigrams + bigrams with min_df = 0.01.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# The mapper places the selected numeric columns (given no transformer: None)
# alongside the TF-IDF vectors of the text column.
mapper = DataFrameMapper([
    (
        [
            'char_count', 'word_count', 'prop_unique_words',
            'avg_sentence_length', 'prop_punctuations',
            'prop_stopwords', 'prop_nouns',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then reuse it on the test frame.
X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)

# Fit svm_clf on the combined features and report test-set metrics.
svm_clf.fit(X=X_train_added_features, y=y_train)
y_pred = svm_clf.predict(X=X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))
for _ in range(2):
    print("------------------------------------------")

# Parameter setting reported by the fitted search object.
print('Best Parameters : ', svm_clf.best_params_)