In [1]:
# Create a baseline model with the minimum accuracy
import pandas as pd
import numpy as np
import os 

In [2]:
# local train directory
train_df = pd.read_csv('data/train.csv')
# print the first 5 rows of the dataframe
print(train_df.head())
# print the shape of the dataframe
print(train_df.shape)
# print the columns of the dataframe
print(train_df.columns)

# check if there is any null data
print(train_df.isnull().sum())

   id  real_text_id
0   0             1
1   1             2
2   2             1
3   3             2
4   4             2
(95, 2)
Index(['id', 'real_text_id'], dtype='object')
id              0
real_text_id    0
dtype: int64


In [3]:
# Load train articles 
TRAIN_DIR = 'data/train'
FILE1_NAME = 'file_1.txt'
FILE2_NAME = 'file_2.txt'
train_folders = os.listdir(TRAIN_DIR)

# print first 5 folders name
print(train_folders[:5])

['article_0000', 'article_0001', 'article_0002', 'article_0003', 'article_0004']


In [4]:
 real_artical_id = train_df.loc[train_df['id'] == 3, 'real_text_id']
print(real_artical_id)

3    2
Name: real_text_id, dtype: int64


In [5]:
import re
# loop through sub training folders and make data frame from them
data = []
for folder in train_folders:
    current_folder_path = os.path.join(TRAIN_DIR, folder)
    # double check if target path is a valid folder
    if os.path.isdir(current_folder_path):
        file_1_path = os.path.join(current_folder_path, FILE1_NAME)
        file_2_path = os.path.join(current_folder_path, FILE2_NAME)

        with open(file_1_path, 'r', encoding='utf-8') as f1, open(file_2_path, 'r', encoding='utf-8') as f2:
            file_1_text = f1.read()
            file_2_text = f2.read()

        regex_folder_name = re.search(r'\d+', folder).group()
        article_id = int(regex_folder_name)
        real_artical_id = train_df.loc[train_df['id'] == article_id, 'real_text_id'].values[0]

        data.append({
            'article_id': article_id,
            'file1_text': file_1_text,
            'file2_text': file_2_text,
            'real_text_id': real_artical_id
        })

formated_data_df = pd.DataFrame(data)
print(formated_data_df.head())
print(len(formated_data_df))
        

   article_id                                         file1_text  \
0           0  The VIRSA (Visible Infrared Survey Telescope A...   
1           1  China\nThe goal of this project involves achie...   
2           2  Scientists can learn about how galaxies form a...   
3           3  China\nThe study suggests that multiple star s...   
4           4  Dinosaur Rex was excited about his new toy set...   

                                          file2_text  real_text_id  
0  The China relay network has released a signifi...             1  
1  The project aims to achieve an accuracy level ...             2  
2  Dinosaur eggshells offer clues about what dino...             1  
3  The importance for understanding how stars evo...             2  
4  Analyzing how fast stars rotate within a galax...             2  
95


In [6]:
# with test data
TEST_DIR = 'data/test'
test_folders = os.listdir(TEST_DIR)

test_data = []
for folder in test_folders:
    current_folder_path = os.path.join(TEST_DIR, folder)
    if os.path.isdir(current_folder_path):
        file_1_path = os.path.join(current_folder_path, FILE1_NAME)
        file_2_path = os.path.join(current_folder_path, FILE2_NAME)

        with open(file_1_path, 'r', encoding='utf-8') as f1, open(file_2_path, 'r', encoding='utf-8') as f2:
            file_1_text = f1.read()
            file_2_text = f2.read()

        regex_folder_name = re.search(r'\d+', folder).group()
        article_id = int(regex_folder_name)

        test_data.append({
            'article_id': article_id,
            'file1_text': file_1_text,
            'file2_text': file_2_text
        })
test_df = pd.DataFrame(test_data)

In [7]:
# 1. text helper methods 
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# [better] 
# 1. more words handling change all words back to it orginal form
# 2. bypass all "meaningless" words
def preprocess_text(text , remove_stopwords=True, lemmatize=True, stem=False):
    # conver all text to lower case
    text = text.lower()
    # remain only char letters 
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # split words into array 
    words = text.split()
    # remove stop words. 
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]
    # to words orginal form
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    
    # words orginal form [ez version]
    # if stem:
    #     stemmer = PorterStemmer()
    #     words = [stemmer.stem(word) for word in words]

    return words
        
# 示例用法
text = "This is an example sentence to demonstrate text preprocessing!"
processed_text = preprocess_text(text)
print(processed_text)

# 2. turn text to vector
def text_to_vector(text, w2v_model):
    tokens = preprocess_text(text)
    
    # fetch tokens vectors
    vectors = []
    for token in tokens:
        if token in w2v_model.wv:
            vectors.append(w2v_model.wv[token])
    
    # return average ｜ mean because longer text will sum bigger vector which is res we dont want 
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(w2v_model.wv.vector_size)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sheng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sheng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['example', 'sentence', 'demonstrate', 'text', 'preprocessing']


In [8]:
# train the text 2 vector model with all texts we have 
from sklearn.feature_extraction.text import TfidfVectorizer
# reorganize data real_text add label 1. fake add label 0. and text will be convert to vector with word2vec
from gensim.models import Word2Vec

print("training Word2Vec model | and TF-IDF")
all_texts = []
for _, row in formated_data_df.iterrows():
    all_texts.append(preprocess_text(row['file1_text']))
    all_texts.append(preprocess_text(row['file2_text']))
    
for _, row in test_df.iterrows():
    all_texts.append(preprocess_text(row['file1_text']))
    all_texts.append(preprocess_text(row['file2_text']))


w2v_model = Word2Vec(sentences=all_texts, vector_size=128, window=5, min_count=2, workers=4)

all_sentences = [' '.join(text) for text in all_texts]
# print("All sentences for TF-IDF:", all_sentences)

tfidf_vectorizer = TfidfVectorizer(min_df=3, max_df=0.95)
tfidf_vectorizer.fit(all_sentences)
# Debug: Print TF-IDF feature names
print("TF-IDF feature names:", tfidf_vectorizer.get_feature_names_out())
vocab_size = len(tfidf_vectorizer.get_feature_names_out())
print(f"TF-IDF Dim（Vocab size）: {vocab_size}")

training Word2Vec model | and TF-IDF
TF-IDF feature names: ['aan' 'aani' 'aaomega' ... 'zro' 'zu' 'zur']
TF-IDF Dim（Vocab size）: 10100


In [9]:

def prepare_data(df):
    # init empty array for hold data
    data_list = []
       
    for _, row in df.iterrows():
        article_id = row['article_id']
        real_text_id = row.get('real_text_id', None)
        # real txt
        file1_vector = text_to_vector(row['file1_text'], w2v_model)
        file1_tfidf = tfidf_vectorizer.transform([row['file1_text']]).toarray().flatten()
        file2_vector = text_to_vector(row['file2_text'], w2v_model)
        file2_tfidf = tfidf_vectorizer.transform([row['file2_text']]).toarray().flatten()
        data_list.append({
            'article_id': row['article_id'],
            'file1_text': row['file1_text'],
            'file1_vector': file1_vector,
            'file1_tfidf': file1_tfidf,
            'file2_vector': file2_vector,
            'file2_tfidf': file2_tfidf,
            'real_text_id': real_text_id
        })
    new_df = pd.DataFrame(data_list)

    
    
    return new_df

### [better words hanlding] 
- 1. more words handling change all words back to it orginal form
- 2. bypass all "meaningless" words

In [10]:
# test out signal example 
test_text = "This is a test sentence"
test_vector = text_to_vector(test_text, w2v_model)
print(f"test content: '{test_text}'")
print(f"output dim: {test_vector.shape}")
print(f"output dim: {test_vector[:5]}")





test content: 'This is a test sentence'
output dim: (128,)
output dim: [-0.03150508 -0.31145465  0.11689466  0.0894896   0.06723356]


In [12]:
# get ready for traning
final_train_df = prepare_data(formated_data_df)
print(final_train_df.head())

   article_id                                         file1_text  \
0           0  The VIRSA (Visible Infrared Survey Telescope A...   
1           1  China\nThe goal of this project involves achie...   
2           2  Scientists can learn about how galaxies form a...   
3           3  China\nThe study suggests that multiple star s...   
4           4  Dinosaur Rex was excited about his new toy set...   

                                        file1_vector  \
0  [0.0230241, -0.38339213, 0.39330837, 0.2068413...   
1  [0.07953804, -0.3028367, 0.42785868, 0.2871812...   
2  [0.110884614, -0.2103085, 0.6307655, 0.3774753...   
3  [0.08061906, -0.30437815, 0.41041118, 0.260557...   
4  [0.08083893, -0.14955753, 0.1624579, 0.2090943...   

                                         file1_tfidf  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0

In [14]:
# prepare to train the data 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# 合并 file1 和 file2 的特征
X_vectors = np.hstack((np.array(final_train_df['file1_vector'].tolist()), np.array(final_train_df['file2_vector'].tolist())))
# X_tfidf = np.hstack((np.array(final_train_df['file1_tfidf'].tolist()), np.array(final_train_df['file2_tfidf'].tolist())))

# double check size is same
# assert X_vectors.shape[0] == X_tfidf.shape[0], "size does not match"
# combin data 
# X_combined = np.hstack((X_vectors, X_tfidf))
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_vectors)
y = final_train_df['real_text_id'].values   

X_train, X_test, y_train, y_test = train_test_split(X_combined_scaled, y, test_size=0.2, random_state=42)

In [30]:
# Train by logisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
model = LogisticRegression()
# model = BaggingClassifier(base_estimator=LogisticRegression(), n_estimators=10, random_state=42)
selector = RFE(estimator=base_model, n_features_to_select=20) 
selector = selector.fit(X_train, y_train)
selector.fit(X_train, y_train)
X_train_filtered = selector.transform(X_train)
X_test_filtered = selector.transform(X_test)
# train
model.fit(X_train_filtered, y_train)

In [18]:
# # Train by XGBoost
# from xgboost import XGBClassifier
# # Create xgboost classifier
# model = XGBClassifier()
# xgb_y_train = y_train - 1
# # train start
# model.fit(X_train, xgb_y_train)

In [32]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
# 4. predict
y_pred = model.predict(X_test_filtered)
y_proba = model.predict_proba(X_test_filtered)
# 5. see how accuracy it is 
accuracy = accuracy_score(y_test, y_pred)
print(y_pred)
print(f"rate: {accuracy:.4f}")

# 使用交叉验证
cv_scores = cross_val_score(model, X_combined_scaled, y, cv=5)
print(f"Cross-validation accuracy: {cv_scores.mean():.4f}")

[2 2 1 1 1 2 1 1 1 2 2 2 2 1 1 2 1 1 1]
rate: 0.9474
Cross-validation accuracy: 0.9053


In [20]:
print("\nReport:")
print(classification_report(y_test, y_pred, target_names=['Fake', 'Real']))


Report:
              precision    recall  f1-score   support

        Fake       1.00      0.83      0.91        12
        Real       0.78      1.00      0.88         7

    accuracy                           0.89        19
   macro avg       0.89      0.92      0.89        19
weighted avg       0.92      0.89      0.90        19



In [21]:
print(test_df.head())

   article_id                                         file1_text  \
0           0  "Music" Music music music Music music Music mu...   
1           1  underground exploration on SN's birth has prov...   
2           2  This research aimed to understand how star sha...   
3           3  Using OmegaCAM's wide field capabilities spann...   
4           4  AssemblyCulture AssemblyCulture AssemblyCultur...   

                                          file2_text  
0  Since its launch on Paranal observatory's Very...  
1  SN 1987A provides valuable insights as newer o...  
2  ChromeDriver music player\nThis study focused ...  
3  greek translation :\nvazhi (megaCAM), territor...  
4  XClass is software tool that helps astronomers...  


In [22]:
# # train the text 2 vector model with all texts we have 
# from sklearn.feature_extraction.text import TfidfVectorizer
# # reorganize data real_text add label 1. fake add label 0. and text will be convert to vector with word2vec
# from gensim.models import Word2Vec

# print("training Word2Vec model | and TF-IDF")
# all_texts = []
# for _, row in test_df.iterrows():
#     all_texts.append(preprocess_text(row['file1_text']))
#     all_texts.append(preprocess_text(row['file2_text']))

# w2v_model = Word2Vec(sentences=all_texts, vector_size=128, window=5, min_count=2, workers=4)

# all_sentences = [' '.join(text) for text in all_texts]
# # print("All sentences for TF-IDF:", all_sentences)

# tfidf_vectorizer = TfidfVectorizer(min_df=3, max_df=0.95)
# tfidf_vectorizer.fit(all_sentences)
# # Debug: Print TF-IDF feature names
# print("TF-IDF feature names:", tfidf_vectorizer.get_feature_names_out())
# vocab_size = len(tfidf_vectorizer.get_feature_names_out())
# print(f"TF-IDF Dim（Vocab size）: {vocab_size}")

In [23]:
final_test_df = prepare_data(test_df)
print(final_test_df.head())
X_test_vectors = np.hstack((np.array(final_test_df['file1_vector'].tolist()), np.array(final_test_df['file2_vector'].tolist())))
# X_test_tfidf = np.hstack((np.array(final_test_df['file1_tfidf'].tolist()), np.array(final_test_df['file2_tfidf'].tolist())))
# x_text_inputs = np.hstack((X_test_vectors, X_test_tfidf))
X__test_combined_scaled = scaler.fit_transform(X_test_vectors)
filiter_X__test_combined_scaled = selector.transform(X__test_combined_scaled)
y_test_pred = model.predict(filiter_X__test_combined_scaled)
print(y_test_pred)

   article_id                                         file1_text  \
0           0  "Music" Music music music Music music Music mu...   
1           1  underground exploration on SN's birth has prov...   
2           2  This research aimed to understand how star sha...   
3           3  Using OmegaCAM's wide field capabilities spann...   
4           4  AssemblyCulture AssemblyCulture AssemblyCultur...   

                                        file1_vector  \
0  [0.08115593, -0.22142166, 0.4903126, 0.2310310...   
1  [0.14342688, -0.24130912, 0.39748383, 0.294650...   
2  [0.21412696, -0.12193324, 0.6276861, 0.3805165...   
3  [0.08599428, -0.29641584, 0.49077696, 0.303334...   
4  [0.112046205, -0.2529205, 0.29999223, 0.271616...   

                                         file1_tfidf  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0

In [26]:
submission_df = pd.DataFrame({
    'id': final_test_df['article_id'],
    'real_text_id': y_test_pred
})
print(submission_df.head())

   id  real_text_id
0   0             2
1   1             2
2   2             1
3   3             2
4   4             2


In [27]:
submission_df.to_csv('submission.csv', index=False)