# WORD2VEC Embedding

## Import Statements

In [7]:
import re
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Load a CSV file and text files into a pandas DataFrame

In [2]:
# Year to analyze: it selects the '<year>/metadata.csv' file loaded below and
# labels the row written to the results CSV at the end of the notebook.
year = 1940

In [3]:
# Load the selected year's metadata.csv as a pandas DataFrame.
# Its 'file' column is used further down as the path of each text file to read.
df = pd.read_csv(f'{year}/metadata.csv')

In [4]:
# A function to load .txt files from the ocr_texts directory
def read_text_file(file_path):
    """Return the contents of a UTF-8 text file, or None when it cannot be located.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    str or None
        The whole file as one string. None is returned when the file does not
        exist, or when `file_path` is a missing value (None or a float such as
        the NaN that pandas produces for an empty CSV cell).
    """
    # A missing path is not a usable file name: open() would raise TypeError
    # for it, so treat it like a missing file instead of aborting the load.
    if file_path is None or isinstance(file_path, float):
        return None
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        return None


In [5]:
# Read every document listed in the 'file' column into a new 'text' column
# (read_text_file returns None for a file that does not exist).
df['text'] = df['file'].apply(read_text_file)

## Pre-process the text

To identify frequently misspelled words and context-specific synonyms of your target word, first run the sections "Train Word2Vec" and "Analyze Results". Then compile the list of frequent errors from the output of `model.wv.most_similar(positive=['YOUR_TARGET_WORD'], topn=30)` and add them to the `ocr_corrections` dictionary below.

### Improve OCR text

In [6]:
# Maps an erroneous substring (key) to its replacement (value).
# correct_ocr applies the entries in insertion order, so a later entry also
# sees the output of the earlier ones.
ocr_corrections = {
    # Incorrectly split words
    'c hinese': 'chinese',
    'ch inese': 'chinese',
    'chi nese': 'chinese',
    'chin ese': 'chinese',
    'chine se': 'chinese',
    'chines e': 'chinese',
    'ja panese': 'japanese',
    'jap anese': 'japanese',
    'japa nese': 'japanese',
    # Was mapped to the misspelling 'japanse', which only came out right
    # because the 'japanse' entry below happened to run afterwards
    'japan ese': 'japanese',
    'japane se': 'japanese',
    'for eign': 'foreign',
    'manchu ria': 'manchuria',
    'immi grant': 'immigrant',
    'immi gration': 'immigration',
    # Incorrectly spelled words
    'japanse': 'japanese',
    # Context-specific synonyms
    'nipponese': 'japanese'
}

def correct_ocr(text, corrections):
    """Apply every substring replacement in `corrections` to `text`.

    `corrections` maps an erroneous substring to its replacement. Entries are
    applied one after another in the mapping's iteration order, each on the
    result of the previous one, and the final string is returned.
    """
    corrected = text
    for wrong in corrections:
        corrected = corrected.replace(wrong, corrections[wrong])
    return corrected

In [7]:
def preprocess_text(text):
    """Normalize one document's raw OCR text before tokenization.

    Steps: replace every run of non-letter characters with a single space,
    lower-case the result, then apply the `ocr_corrections` replacements.

    Parameters
    ----------
    text : str or any
        Raw document text. Anything that is not a string is treated as a
        missing document.

    Returns
    -------
    str
        The cleaned text, or '' for a missing document.
    """
    # A missing document (None from read_text_file, or NaN) must not be
    # stringified: str(None) would put the bogus word 'none' into the corpus.
    if not isinstance(text, str):
        return ''

    # Remove numbers and special characters from the text
    text = re.sub("[^A-Za-z]+", " ", text)
    # Turn all words into lower case; the correction keys are lower case too
    text = text.lower()
    # Correct incorrectly split or misspelled words listed in ocr_corrections
    text = correct_ocr(text, ocr_corrections)
    return text
    
# Clean every document and keep the result in a new 'processed_text' column
df['processed_text'] = df['text'].apply(preprocess_text)

### Tokenize the text

In [8]:
# Initialize the lemmatizer and the English stopword list
# NOTE(review): both presumably need the NLTK 'wordnet' and 'stopwords' data to
# be installed already — confirm on a fresh environment.
lemmatizer = WordNetLemmatizer()
english_stopwords = set(stopwords.words('english'))  # Convert to a set for faster membership checking

def tokenize_text(text):
    """Tokenize `text`, drop English stopwords, and lemmatize what remains.

    Returns the list of lemmatized tokens in their original order.
    """
    lemmas = []
    for token in word_tokenize(text):
        # Stopword membership is checked on the lower-cased token
        if token.lower() in english_stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Tokenize every cleaned document; each row of 'tokenized_text' is a list of tokens
df['tokenized_text'] = df['processed_text'].apply(tokenize_text)

## Train Word2Vec

In [9]:
# Train the embedding on the tokenized documents (one token list per row).
# Hyperparameters: vector_size = 100, window = 20, min_count = 20
# NOTE(review): gensim documents that training with more than one worker is not
# exactly reproducible between runs — confirm whether that matters here.
model = Word2Vec(
    sentences=df['tokenized_text'],
    vector_size=100,
    window=20,
    min_count=20,
    workers=4,
)

## Analyze Results

In [10]:
# List the 30 vocabulary entries most similar to the target word, with their scores.
# Change 'japanese' to your target word.
model.wv.most_similar(positive=['japanese'], topn=30)

[('shanghai', 0.7499285936355591),
 ('jap', 0.723640501499176),
 ('japan', 0.7170374393463135),
 ('hai', 0.7092571258544922),
 ('hongkong', 0.6974321007728577),
 ('shang', 0.6955792903900146),
 ('tokyo', 0.6910163164138794),
 ('orient', 0.6898350715637207),
 ('tientsin', 0.6875103116035461),
 ('tokio', 0.674040675163269),
 ('yangtze', 0.6640757322311401),
 ('incident', 0.6627698540687561),
 ('hankow', 0.6591790318489075),
 ('korea', 0.6585856676101685),
 ('peiping', 0.646751344203949),
 ('nippon', 0.6459189653396606),
 ('chinese', 0.6426478624343872),
 ('nese', 0.6389297842979431),
 ('filipino', 0.6320447325706482),
 ('yokohama', 0.6319870352745056),
 ('china', 0.6309860944747925),
 ('abend', 0.6272905468940735),
 ('manchuria', 0.6264177560806274),
 ('chlna', 0.624900758266449),
 ('chungking', 0.6132591366767883),
 ('militarist', 0.6098368763923645),
 ('puppet', 0.6091718077659607),
 ('nanking', 0.6083692312240601),
 ('domei', 0.6013432145118713),
 ('manchoukuo', 0.6012725234031677)]

In [11]:
# Similarity between the 'japanese' and 'enemy' word vectors
enemy = model.wv.similarity(w1='japanese', w2='enemy')
print(enemy)

0.1410837


In [12]:
# Similarity between the 'japanese' and 'immigrant' word vectors
immigrant = model.wv.similarity(w1='japanese', w2='immigrant')
print(immigrant)

0.23920898


In [13]:
# Similarity between the 'japanese' and 'soldier' word vectors
soldier = model.wv.similarity(w1='japanese', w2='soldier')
print(soldier)

0.10520072


In [14]:
# Similarity between the 'japanese' and 'ally' word vectors
ally = model.wv.similarity(w1='japanese', w2='ally')
print(ally)

0.005692895


In [15]:
# Similarity between the 'japanese' and 'american' word vectors
american = model.wv.similarity(w1='japanese', w2='american')
print(american)

0.17709936


In [16]:
# Similarity between the 'japanese' and 'chinese' word vectors
chinese = model.wv.similarity(w1='japanese', w2='chinese')
print(chinese)

0.64264786


## Sentiment Score

In [17]:
# Sentiment analyzer class from NLTK
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the 'vader_lexicon' resource (presumably required by the analyzer — confirm)
nltk.download('vader_lexicon')
# Single analyzer instance shared by analyze_sentiment() below
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hahm/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
def filter_sentences(df, keyword):
    """Return the rows of `df` whose 'processed_text' contains `keyword`.

    The match is a case-insensitive, literal substring test. Despite the
    function's name, in this notebook each row holds a whole document's
    processed text rather than a single sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'processed_text' column.
    keyword : str
        Text to look for. It is matched literally, not as a regular expression.

    Returns
    -------
    pandas.DataFrame
        The matching rows. Rows whose text is missing never match.
    """
    # regex=False: characters such as '+' or '(' in the keyword are matched
    # literally instead of raising or changing the pattern's meaning.
    # na=False: missing text yields False rather than NaN, which cannot be
    # used as a boolean mask.
    matches = df['processed_text'].str.contains(keyword, case=False, na=False, regex=False)
    return df[matches]

def analyze_sentiment(sentences):
    """Score each entry of `sentences` with the shared analyzer `sia`.

    Returns a list holding the `polarity_scores` result for every entry, in
    input order.
    """
    scores = []
    for entry in sentences:
        scores.append(sia.polarity_scores(entry))
    return scores

# Keep only the documents that mention the target word
japanese_sentences = filter_sentences(df, 'japanese')

# Analyze the sentiment of each selected document
japanese_sentiment = analyze_sentiment(japanese_sentences['processed_text'])

# Sum up all the compound scores
j_total_compound_score = sum(sentiment['compound'] for sentiment in japanese_sentiment)

# Average compound score. When no document mentions the keyword there is
# nothing to average, so record NaN instead of raising ZeroDivisionError.
if japanese_sentiment:
    j_average_compound_score = j_total_compound_score / len(japanese_sentiment)
else:
    j_average_compound_score = float('nan')

print(j_average_compound_score)

0.4113111053984594


## Update the CSV file

In [19]:
# Data to be added as a new row: one row per analyzed year, holding the
# similarity scores and the average sentiment computed above
new_data = {
    'year': [year],
    'enemy': [enemy],
    'immigrant': [immigrant],
    'soldier': [soldier],
    'ally': [ally],
    'american': [american],
    'chinese': [chinese],
    'j_average_compound_score': [j_average_compound_score]
}

# File path
file_path = 'cosine.csv'
new_row = pd.DataFrame(new_data)

# Load the results collected so far, if any
try:
    # Read the existing CSV file
    previous_results = pd.read_csv(file_path)
except FileNotFoundError:
    # First run: there is nothing to merge with yet
    previous_results = None

if previous_results is not None and 'year' in previous_results.columns:
    # Drop any earlier row for this year, so that re-running the notebook
    # replaces the year's results instead of appending a duplicate row
    previous_results = previous_results[previous_results['year'] != year]

if previous_results is None or previous_results.empty:
    # Nothing to keep: the new row is the whole table (this also avoids
    # concatenating with an empty frame)
    df_results = new_row
else:
    # Append the new data
    df_results = pd.concat([previous_results, new_row], ignore_index=True)

# Save the updated DataFrame to CSV
df_results.to_csv(file_path, index=False)

# Display the updated DataFrame
df_results


Unnamed: 0,year,enemy,immigrant,soldier,ally,american,chinese,j_average_compound_score
0,1917,-0.069088,0.507748,0.058896,0.053292,0.242649,0.517685,0.716156
1,1918,0.02767,0.293286,-0.102258,0.111673,0.241416,0.667243,0.435222
2,1919,0.075328,0.458734,0.176309,-0.041869,0.297431,0.528501,0.627945
3,1920,0.134512,0.502782,0.143171,0.053065,0.225031,0.470205,0.624098
4,1921,0.094014,0.392006,0.073137,-0.002816,0.306282,0.424202,0.713038
5,1922,0.007384,0.162523,0.04007,0.087442,0.172594,0.376771,0.727726
6,1923,0.064556,0.408621,0.094039,-0.00835,0.245259,0.59933,0.627035
7,1924,0.102849,0.623948,0.01529,-0.027155,0.375301,0.5453,0.642734
8,1925,0.0279,0.564351,0.109987,0.076618,0.3333,0.553175,0.515829
9,1926,0.135841,0.446116,0.220782,0.084134,0.328506,0.547844,0.723613
