__Files created__
- doc2vec_features.csv

In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the CSV paths used in the
# notebook (under /content/drive/MyDrive/...) resolve.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


### Automatic Feature Extraction

In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# NLTK resources used by the preprocessing function in this notebook:
#   punkt / punkt_tab -> word_tokenize
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from `punkt_tab` rather than the
# pickled `punkt` package; without it word_tokenize raises LookupError on a fresh
# kernel. Fetching both keeps the notebook runnable on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Load the article table (title, page_id, text, text_tags, quality) from Drive.
text_csv_path = '/content/drive/MyDrive/학교/Dissertation/Features/text.csv'
df = pd.read_csv(text_csv_path)
df

Unnamed: 0,title,page_id,text,text_tags,quality
0,Mayan languages,182013,The Mayan languages form a language family spo...,{{short description|Language family spoken in ...,FA
1,Mu'awiya I,207068,"Mu'awiya I (Arabic: معاوية بن أبي سفيان, roman...",{{Short description|Founder of the Umayyad Cal...,FA
2,The Fountainhead,180464,The Fountainhead is a 1943 novel by Russian-Am...,{{short description|1943 novel by Ayn Rand}}\n...,FA
3,Northern pintail,218361,The pintail or northern pintail (Anas acuta) i...,{{Short description|Migratory duck that breeds...,FA
4,Manhattan Project,19603,The Manhattan Project was a research and devel...,{{Short description|World War II Allied nuclea...,FA
...,...,...,...,...,...
5195,Party of Democratic Kampuchea,265468,The Party of Democratic Kampuchea was a politi...,{{short description|Former political party in ...,Stub
5196,Minawara and Multultu,95240,"In Aboriginal mythology, Minawara and Multultu...",{{Short description|Figures in Australian Abor...,Stub
5197,Theophylline/ephedra/hydroxyzine,262652,Theophylline/ephedra/hydroxyzine (trade name M...,{{Short description|Drug used for treatment of...,Stub
5198,"Channel Lake, Illinois",111450,Channel Lake is an unincorporated community an...,{{Use mdy dates|date=July 2023}}\n{{Infobox se...,Stub


In [4]:
# Binary-encode the quality class: 'FA' and 'GA' become 1, every other label 0.
is_fa = df['quality'] == 'FA'
is_ga = df['quality'] == 'GA'
df['quality'] = np.where(is_fa | is_ga, 1, 0)
df

Unnamed: 0,title,page_id,text,text_tags,quality
0,Mayan languages,182013,The Mayan languages form a language family spo...,{{short description|Language family spoken in ...,1
1,Mu'awiya I,207068,"Mu'awiya I (Arabic: معاوية بن أبي سفيان, roman...",{{Short description|Founder of the Umayyad Cal...,1
2,The Fountainhead,180464,The Fountainhead is a 1943 novel by Russian-Am...,{{short description|1943 novel by Ayn Rand}}\n...,1
3,Northern pintail,218361,The pintail or northern pintail (Anas acuta) i...,{{Short description|Migratory duck that breeds...,1
4,Manhattan Project,19603,The Manhattan Project was a research and devel...,{{Short description|World War II Allied nuclea...,1
...,...,...,...,...,...
5195,Party of Democratic Kampuchea,265468,The Party of Democratic Kampuchea was a politi...,{{short description|Former political party in ...,0
5196,Minawara and Multultu,95240,"In Aboriginal mythology, Minawara and Multultu...",{{Short description|Figures in Australian Abor...,0
5197,Theophylline/ephedra/hydroxyzine,262652,Theophylline/ephedra/hydroxyzine (trade name M...,{{Short description|Drug used for treatment of...,0
5198,"Channel Lake, Illinois",111450,Channel Lake is an unincorporated community an...,{{Use mdy dates|date=July 2023}}\n{{Infobox se...,0


In [5]:
# Function to preprocess the text for Doc2Vec

# NLTK resources shared by every call. They are built lazily on first use so the
# stop-word list is read once instead of once per document.
_DOC2VEC_NLTK_CACHE = {}


def _doc2vec_nltk_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _DOC2VEC_NLTK_CACHE:
        _DOC2VEC_NLTK_CACHE['stop_words'] = set(stopwords.words('english'))
        _DOC2VEC_NLTK_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _DOC2VEC_NLTK_CACHE['stop_words'], _DOC2VEC_NLTK_CACHE['lemmatizer']


def doc2vec_preprocessing(text):
    """Clean one article's text and return it as a space-joined string of lemmas.

    Steps, in order:
      1. Keep only the part before the first trailing-section heading
         (See also, References, Notes, Footnotes, Sources, External links,
         Works cited, Further reading).
      2. Replace runs of newlines and tabs with a space and lowercase the text.
      3. Strip ASCII punctuation (``string.punctuation``) and digits.
      4. Tokenize with NLTK ``word_tokenize``, drop English stop words and
         lemmatize each token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw article text. Missing values (``None``, ``NaN``) or any other
        non-string value are treated as an empty document.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``''`` when the input is
        missing, not a string, or contains only whitespace.
    """
    # A missing cell arrives from pandas as a float NaN; re.split would raise
    # TypeError on it, so treat anything that is not a string as an empty document.
    if not isinstance(text, str) or not text.strip():
        return ''

    # Split the text by '== References ==' and similar titles, then take the first part
    text_split = re.split(r'==\s*(?:See also|References|Notes|Footnotes|Sources|External links|Works cited|Further reading)\s*==', text, maxsplit=1)[0]

    # Remove '\n' and '\t'
    text_cleaned = re.sub(r'\n+', ' ', text_split)
    text_cleaned = re.sub(r'\t+', ' ', text_cleaned)

    # Convert to lowercase
    text_cleaned = text_cleaned.lower()

    # Remove punctuation (ASCII only; other Unicode punctuation is left in place)
    text_cleaned = text_cleaned.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text_cleaned = re.sub(r'\d+', '', text_cleaned)

    # Tokenize text
    words = word_tokenize(text_cleaned)

    # Remove stop words, then lemmatize what is left
    stop_words, lemmatizer = _doc2vec_nltk_resources()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join words back into a single string
    return ' '.join(words)

In [None]:
# Keep only the columns needed from here on. `.copy()` makes the selection an
# independent DataFrame, so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
df = df[['title', 'page_id', 'text', 'quality']].copy()

# Replace the raw article text with its cleaned, lemmatized form.
df['text'] = df['text'].apply(doc2vec_preprocessing)
df

In [None]:
# Doc2Vec embedding with 20 dimensions
# Tokenize the cleaned text; the column holds token lists until the inferred
# vectors replace them at the end of this cell.
df['doc2vec_20'] = df['text'].apply(word_tokenize)

# Wrap each token list in a TaggedDocument tagged with its positional index
documents = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(df['doc2vec_20'])
]

# Fit the Doc2Vec model on the tagged corpus
model = Doc2Vec(documents, vector_size=20, min_count=1, workers=4)

# Swap every token list for the vector the trained model infers for it
df['doc2vec_20'] = df['doc2vec_20'].apply(model.infer_vector)
df

In [None]:
# Doc2Vec embedding with 50 dimensions
# Tokenize the cleaned text; the column holds token lists until the inferred
# vectors replace them at the end of this cell.
df['doc2vec_50'] = df['text'].apply(word_tokenize)

# Wrap each token list in a TaggedDocument tagged with its positional index
documents = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(df['doc2vec_50'])
]

# Fit the Doc2Vec model on the tagged corpus
model = Doc2Vec(documents, vector_size=50, min_count=1, workers=4)

# Swap every token list for the vector the trained model infers for it
df['doc2vec_50'] = df['doc2vec_50'].apply(model.infer_vector)
df

In [None]:
# Create Doc2Vec with the size of 100
# Preprocess the text (tokenization); the column temporarily holds token lists
df['doc2vec_100'] = df['text'].apply(lambda x: word_tokenize(x))

# Create TaggedDocument, tagging each document with its positional index
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['doc2vec_100'])]

# Train a Doc2Vec model
# vector_size has to be 100 here so the 'doc2vec_100' column really holds
# 100-dimensional vectors (a value of 50 would just duplicate the size-50 setup).
model = Doc2Vec(documents, vector_size=100, min_count=1, workers=4)

# Infer vectors for each document in the DataFrame (replaces the token lists)
df['doc2vec_100'] = df['doc2vec_100'].apply(lambda x: model.infer_vector(x))
df

In [None]:
# Create Doc2Vec with the size of 200
# Preprocess the text (tokenization); the column temporarily holds token lists
df['doc2vec_200'] = df['text'].apply(lambda x: word_tokenize(x))

# Create TaggedDocument, tagging each document with its positional index
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['doc2vec_200'])]

# Train a Doc2Vec model
# vector_size has to be 200 here so the 'doc2vec_200' column really holds
# 200-dimensional vectors (a value of 50 would just duplicate the size-50 setup).
model = Doc2Vec(documents, vector_size=200, min_count=1, workers=4)

# Infer vectors for each document in the DataFrame (replaces the token lists)
df['doc2vec_200'] = df['doc2vec_200'].apply(lambda x: model.infer_vector(x))
df

In [None]:
# Create Doc2Vec with the size of 300
# Preprocess the text (tokenization); the column temporarily holds token lists
df['doc2vec_300'] = df['text'].apply(lambda x: word_tokenize(x))

# Create TaggedDocument, tagging each document with its positional index
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['doc2vec_300'])]

# Train a Doc2Vec model
# vector_size has to be 300 here so the 'doc2vec_300' column really holds
# 300-dimensional vectors (a value of 50 would just duplicate the size-50 setup).
model = Doc2Vec(documents, vector_size=300, min_count=1, workers=4)

# Infer vectors for each document in the DataFrame (replaces the token lists)
df['doc2vec_300'] = df['doc2vec_300'].apply(lambda x: model.infer_vector(x))
df

In [None]:
# Create Doc2Vec with the size of 500
# Preprocess the text (tokenization); the column temporarily holds token lists
df['doc2vec_500'] = df['text'].apply(lambda x: word_tokenize(x))

# Create TaggedDocument, tagging each document with its positional index
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['doc2vec_500'])]

# Train a Doc2Vec model
# vector_size has to be 500 here so the 'doc2vec_500' column really holds
# 500-dimensional vectors (a value of 50 would just duplicate the size-50 setup).
model = Doc2Vec(documents, vector_size=500, min_count=1, workers=4)

# Infer vectors for each document in the DataFrame (replaces the token lists)
df['doc2vec_500'] = df['doc2vec_500'].apply(lambda x: model.infer_vector(x))
df

In [None]:
# Write the frame, including every doc2vec_* column, to the features CSV on Drive.
features_csv_path = '/content/drive/MyDrive/학교/Dissertation/Dissertation_final/Feature Extraction/Data Preprocessing/doc2vec_features.csv'
df.to_csv(features_csv_path)