In [1]:
# Create a baseline model with the minimum accuracy

import pandas as pd
import numpy as np
import os 

In [2]:
# local train directory
train_df = pd.read_csv('data/train.csv')
# print the first 5 rows of the dataframe
print(train_df.head())
# print the shape of the dataframe
print(train_df.shape)
# print the columns of the dataframe
print(train_df.columns)

# check if there is any null data
print(train_df.isnull().sum())

   id  real_text_id
0   0             1
1   1             2
2   2             1
3   3             2
4   4             2
(95, 2)
Index(['id', 'real_text_id'], dtype='object')
id              0
real_text_id    0
dtype: int64


In [3]:
# Load train articles 
TRAIN_DIR = 'data/train'
FILE1_NAME = 'file_1.txt'
FILE2_NAME = 'file_2.txt'
train_folders = os.listdir(TRAIN_DIR)

# print first 5 folders name
print(train_folders[:5])

['article_0045', 'article_0042', 'article_0089', 'article_0074', 'article_0080']


In [4]:
 real_artical_id = train_df.loc[train_df['id'] == 3, 'real_text_id']
print(real_artical_id)

3    2
Name: real_text_id, dtype: int64


In [5]:
import re
# loop through sub training folders and make data frame from them
data = []
for folder in train_folders:
    current_folder_path = os.path.join(TRAIN_DIR, folder)
    # double check if target path is a vlid folder?
    if(os.path.isdir(current_folder_path)):
        file_1_path = os.path.join(current_folder_path, FILE1_NAME)
        file_2_path = os.path.join(current_folder_path, FILE2_NAME)

        with open(file_1_path, 'r') as f1, open(file_2_path, 'r') as f2:
            file_1_text = f1.read()
            file_2_text = f2.read()

        regex_folder_name =  re.search(r'\d+', folder).group()
        article_id = int(regex_folder_name)
        real_artical_id = train_df.loc[train_df['id'] == article_id, 'real_text_id'].values[0]

        # distinguish real or fake text 
        real_text = file_1_text if real_artical_id == 1 else file_2_text
        fake_text = file_2_text if real_artical_id == 1 else file_1_text

        data.append({
            'article_id': article_id,
            'real_text': real_text,
            'fake_text': fake_text
        })
        

formated_data_df = pd.DataFrame(data)
print(formated_data_df.head())
        
        

   article_id                                          real_text  \
0          45  The VLT has enabled two major projects using t...   
1          42  A key question is what causes powerful outflow...   
2          89  The 2006 SPIE Symposium on Astronomical Telesc...   
3          74  The primary mirror design of the European Extr...   
4          80  The goal of this one-day workshop, part of the...   

                                           fake_text  
0  We have undertaken two major projects using th...  
1  A burning question for us is what fuels the mo...  
2  The 2006 SPIE Symposium on Astronomical Telesc...  
3  The primary mirror design for the European Ext...  
4  The goal of this one-day workshop, part of the...  


In [6]:
# reorganize data real_text add label 1. fake add label 0. and text will be convert to vector with word2vec
from gensim.models import Word2Vec

In [7]:
def prepare_data(df):
    # init empty array for hold data
    data_list = []
       
    for _, row in df.iterrows():
        # real txt
        real_vector = text_to_vector(row['real_text'], w2v_model)
        data_list.append({
            'text': row['real_text'],
            'label': 1,
            'vector': real_vector
        })
    
        # fake txt
        fake_vector = text_to_vector(row['fake_text'], w2v_model)
        data_list.append({
            'text': row['fake_text'],
            'label': 0,
            'vector': fake_vector
        })

    new_df = pd.DataFrame(data_list)

    
    
    return new_df

In [9]:
# 1. text helper methods 
def preprocess_text(text):
    text = text.lower()                    
    text = re.sub(r'[^a-zA-Z\s]', '', text) 
    return text.split()        


# 2. turn text to vector
def text_to_vector(text, w2v_model):
    tokens = preprocess_text(text)
    
    # fetch tokens vectors
    vectors = []
    for token in tokens:
        if token in w2v_model.wv:
            vectors.append(w2v_model.wv[token])
    
    # return average ｜ mean because longer text will sum bigger vector which is res we dont want 
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(w2v_model.wv.vector_size)

In [10]:
# train the text 2 vector model with all texts we have 
print("training Word2Vec model...")
all_texts = []
for _, row in formated_data_df.iterrows():
    all_texts.append(preprocess_text(row['real_text']))
    all_texts.append(preprocess_text(row['fake_text']))

w2v_model = Word2Vec(sentences=all_texts, vector_size=100, window=5, min_count=2, workers=4)


training Word2Vec model...


In [11]:
# test out signal example 
test_text = "This is a test sentence"
test_vector = text_to_vector(test_text, w2v_model)
print(f"test content: '{test_text}'")
print(f"output dim: {test_vector.shape}")
print(f"output dim: {test_vector[:5]}")





test content: 'This is a test sentence'
output dim: (100,)
output dim: [-0.65343744  0.7601629   0.35974774  0.20776053  0.42584202]


In [12]:
# get ready for traning
final_train_df = prepare_data(formated_data_df)
print(final_train_df.head())

                                                text  label  \
0  The VLT has enabled two major projects using t...      1   
1  We have undertaken two major projects using th...      0   
2  A key question is what causes powerful outflow...      1   
3  A burning question for us is what fuels the mo...      0   
4  The 2006 SPIE Symposium on Astronomical Telesc...      1   

                                              vector  
0  [-0.3850595, 0.45181227, 0.21268378, 0.1195629...  
1  [-0.41965276, 0.4923063, 0.2313726, 0.12997022...  
2  [-0.39590812, 0.4650436, 0.2184925, 0.12260305...  
3  [-0.4058296, 0.4762467, 0.22334301, 0.12556039...  
4  [-0.38046274, 0.4477853, 0.20888482, 0.1177211...  


In [13]:
#check vector exist
print(final_train_df['vector'].iloc[0])

[-0.3850595   0.45181227  0.21268378  0.11956295  0.2548367  -0.71222335
  0.23883705  0.9614935  -0.2820338  -0.30678248 -0.19005585 -0.65788007
 -0.32269716  0.19566357  0.2604831  -0.1879813   0.21458021 -0.2789142
 -0.0717303  -0.74731326  0.3763354   0.08588213  0.46820936 -0.28876683
 -0.05159499  0.06134443 -0.38268653 -0.1530935  -0.17952447  0.02624926
  0.71888906 -0.20242117 -0.04192891 -0.3702063  -0.06292815  0.5507588
  0.21327262 -0.41083777 -0.28498262 -0.5013271  -0.01956422 -0.53746367
 -0.29426235  0.28117058  0.10457748 -0.10756782 -0.30622965  0.04533314
  0.1767451   0.39287543  0.29109925 -0.3923705  -0.3925068   0.05960498
 -0.55001765  0.1300859   0.57544726 -0.41437635 -0.2881538   0.06690799
 -0.04587173  0.19370058 -0.34602267  0.1811748  -0.3648875   0.37097538
 -0.06311803  0.38362148 -0.18339378  0.32235017  0.02772361  0.3116253
  0.28831902 -0.0807725   0.2720953   0.39508447 -0.19773069  0.13249268
 -0.06610598 -0.1225758  -0.18728998 -0.03222792  0.10

In [14]:
# prepare to train the data 
from sklearn.model_selection import train_test_split
X = np.array(final_train_df['vector'].tolist())  # col to list
y = final_train_df['label'].values        

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

In [16]:
from sklearn.metrics import accuracy_score, classification_report
# 4. predict
y_pred = model.predict(X_test)

# 5. see how accuracy it is 
accuracy = accuracy_score(y_test, y_pred)
print(f"rate: {accuracy:.4f}")

rate: 0.4737


In [17]:
print("\nReport:")
print(classification_report(y_test, y_pred, target_names=['Fake', 'Real']))


Report:
              precision    recall  f1-score   support

        Fake       0.55      0.29      0.37        21
        Real       0.44      0.71      0.55        17

    accuracy                           0.47        38
   macro avg       0.49      0.50      0.46        38
weighted avg       0.50      0.47      0.45        38

