In [None]:
# import libraries

import pandas as pd
import os
import re
import ast
import MeCab
import math

In [None]:
# Environment setup: point the LANG variable of this process at a UTF-8 locale.
os.environ.update({"LANG": "en_US.UTF-8"})

In [None]:
# CLEAN TEXT COLUMN (4)

# Read the crawl results and discard the 'Unnamed: 0' column in a single chain
# (presumably the index column written by an earlier to_csv call -- confirm).
url_df = pd.read_csv('url_df.csv').drop(columns=['Unnamed: 0'])

def remove_url(text):
    """Remove the first URL-like run (and every identical copy of it) from ``text``.

    A URL is taken to start at the first ``htt`` match or, when there is none,
    at the first ``www.`` match. It extends up to, but not including, the next
    Japanese character (kana, kanji, half-width katakana), or to the end of the
    string when no Japanese character follows.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the detected URL removed, or ``text`` unchanged when no
        URL start is found. Only one distinct URL is handled per call, so a
        text with several different links needs several calls.
    """
    japanese_char = re.compile(r'[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]')
    http = re.search('htt', text)
    # NOTE(review): the '.' is unescaped, so this also matches "www" followed by
    # any character -- confirm whether a literal dot was intended.
    www = re.search('www.', text)

    # An 'htt' match takes priority over a 'www.' match, wherever they occur.
    start_match = http if http is not None else www
    if start_match is None:
        return text

    start = start_match.start()
    boundary = japanese_char.search(text, start)
    # With no Japanese character after the URL, the URL runs to the very end
    # of the string. Stopping at len(text) - 1 would leave its last character
    # behind in the cleaned text.
    end = boundary.start() if boundary is not None else len(text)
    return text.replace(text[start:end], '')

# remove_url handles one distinct URL per call, so it is applied up to this
# many times per text.
_MAX_URL_PASSES = 5

# Marks that are rewritten to the sentence delimiter '。', in the order in
# which the replacements are applied.
_SENTENCE_BREAK_MARKS = (
    "・・・", "…",
    "!?", "?!", "！？", "？！",
    "?)", "？）", "!)", "！）",
    "？", "！", "?", "!",
    "<break>",
)


def _clean_and_segment(raw_text):
    """Clean one page text and split it into a list of non-empty sentences.

    Steps, in order: drop newlines and apostrophes, drop runs of two or more
    'ー', strip URLs, drop '(笑)' / '（笑）', drop ASCII and full-width digits,
    turn the marks in ``_SENTENCE_BREAK_MARKS`` into '。', delete every
    character that is not hiragana, katakana, kanji, '。', 'ー' or an ASCII
    letter, then split on '。' and discard empty pieces.

    Args:
        raw_text: The raw page text (a string).

    Returns:
        List of cleaned sentence strings; may be empty.
    """
    text = raw_text.replace("\n", "")  # remove new line
    text = text.replace("\'", '')
    text = re.sub(r'ー{2,}', '', text)  # remove repetitions of ー

    for _ in range(_MAX_URL_PASSES):
        stripped = remove_url(text)
        if stripped == text:
            # Nothing left to remove; further passes would return the same text.
            break
        text = stripped

    text = text.replace('(笑)', '')
    text = text.replace('（笑）', '')
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[０-９]+', '', text)

    for mark in _SENTENCE_BREAK_MARKS:
        text = text.replace(mark, '。')

    # remove all symbols other than english and japanese characters
    text = re.sub(r'[^ぁ-んァ-ン一-龯。ーA-Za-z]', '', text)

    return [sentence for sentence in text.split('。') if sentence != '']


segmented_text = []
for i in range(len(url_df)):
    print('Processing ' + str(i) + ' out of ' + str(len(url_df)))
    # append() keeps the loop linear; `list = list + [x]` copies the whole
    # list on every iteration.
    segmented_text.append(_clean_and_segment(url_df['text'][i]))

url_df['segmented_text'] = segmented_text

# Number of sentences per page, stored next to the segmented text.
sequence_num = []
for i in range(len(url_df)):
    print('Processing ' + str(i) + ' out of ' + str(len(url_df)))
    # append() instead of `list = list + [x]`, which re-copies the list on
    # every iteration and makes the loop quadratic.
    sequence_num.append(len(url_df['segmented_text'][i]))

url_df['sequence_length'] = sequence_num

# Keep the first row per URL, then the first row per raw text, and renumber
# the index so that later positional lookups such as url_df[col][i] still work.
url_df = (
    url_df
    .drop_duplicates(subset=['url'], keep="first")
    .drop_duplicates(subset=['text'], keep="first")
    .reset_index(drop=True)
)

# Re-join each page's cleaned sentences into one delimiter-free string and
# save the cleaned frame.
segmented_text_joined = []
for m in range(len(url_df)):
    print('Processing ' + str(m) + ' out of ' + str(len(url_df)))
    # append() keeps this linear; the `list = list + [x]` form is quadratic.
    segmented_text_joined.append(''.join(url_df['segmented_text'][m]))

url_df['segmented_text_joined'] = segmented_text_joined
url_df.to_csv('url_df_cleaned.csv', sep=',', encoding='utf-8')

In [None]:
# SEGMENT THE TEXT INTO SENTENCE SEQUENCES (5)
#
# Expands url_df to one row per sentence: each page's seed word, rank and URL
# are repeated 'sequence_length' times and paired with the sentence and its
# 1-based position within the page.

seed_words = []
urls = []
seq_sentence_ns = []
texts = []
ranks = []
for i in range(len(url_df)):
    print('Processing ' + str(i) + ' out of ' + str(len(url_df)))
    n_sentences = int(url_df['sequence_length'][i])

    sentences = url_df['segmented_text'][i]
    # The column holds real lists when url_df comes straight from the cleaning
    # cell, and their string form when url_df was reloaded from CSV.
    # ast.literal_eval raises ValueError on a list, so only parse strings.
    if isinstance(sentences, str):
        sentences = ast.literal_eval(sentences)

    # extend()/append() instead of `list = list + [...]`, which is quadratic.
    seed_words.extend([url_df['seed_word'][i]] * n_sentences)
    ranks.extend([url_df['rank'][i]] * n_sentences)
    urls.extend([url_df['url'][i]] * n_sentences)
    for j in range(n_sentences):
        seq_sentence_ns.append(j + 1)
        texts.append(sentences[j])


seq_df = pd.DataFrame({
    'seed_word': seed_words,
    'rank': ranks,
    'url': urls,
    'seq_sentence_n': seq_sentence_ns,
    'cleaned_segmented_text': texts,
})

seq_df.to_csv('sequence_webcrawl_cleaned_df.csv', sep=',', encoding='utf-8')


In [None]:
# CREATE NOUNS LIST FOR TF-IDF (6)
#
# Runs MeCab over every page's joined text, collects the surface form of each
# token whose feature string starts with '名詞' (noun), filters short tokens,
# de-duplicates, and writes the list to nouns_list_all.txt as a Python literal.

mecab = MeCab.Tagger()

nouns_list = []
for i in range(len(url_df)):
    print('Processing ' + str(i) + ' out of ' + str(len(url_df)))
    mecab_parsed = mecab.parse(url_df['segmented_text_joined'][i])

    # One output line per token; fields are tab-separated and the first
    # comma-separated item of a field is compared against '名詞'.
    # NOTE(review): assumes the "surface<TAB>pos,..." output layout -- confirm
    # against the installed MeCab dictionary.
    for parse in mecab_parsed.split('\n'):
        for par in parse.split('\t'):
            if par.split(',')[0] == '名詞':
                # append() keeps accumulation linear; `list = list + [x]`
                # copies the whole list for every noun found.
                nouns_list.append(parse.split()[0])


cleaned_nouns_list = []
for j in range(len(nouns_list)):
    print('Processing ' + str(j) + ' out of ' + str(len(nouns_list)))
    # NOTE(review): str.isalpha() is also True for kana and kanji, so this
    # drops every noun of one or two letters, not just short English tokens --
    # confirm that is intended.
    if (nouns_list[j].isalpha()) and (len(nouns_list[j]) <= 2):
        continue
    cleaned_nouns_list.append(nouns_list[j])

# Set iteration order for strings can differ between interpreter runs; sorting
# makes the saved list (and the column order derived from it) reproducible.
cleaned_nouns_list = sorted(set(cleaned_nouns_list))

# NOTE(review): written with the platform's default text encoding, which is
# also what the cells that read this file back rely on.
with open("nouns_list_all.txt", "w") as output:
    output.write(str(cleaned_nouns_list))


In [None]:
# CREATE TERM FREQUENCY FOR EACH NOUN OVER THE SAME SEED WORDS (8)
#
# For every seed word in word_df, concatenates the joined text of all pages
# crawled for that seed word and counts how often each noun occurs in it.
# Output: tf_by_seed_df.csv with one row per seed word and one
# 'tf_by_seed(<noun>)' column per noun.

word_df = pd.read_csv('word_df.csv')
word_df = word_df.drop(columns=['Unnamed: 0'])

# `with` closes the file even if reading or parsing raises.
with open("nouns_list_all.txt", "r") as txt_file:
    file_content = txt_file.read()
nouns_list = ast.literal_eval(file_content)

# Group the page texts by seed word in a single pass over url_df (row order is
# preserved inside each group) instead of rescanning url_df for every word.
text_parts_by_seed = {}
for seed, joined_text in zip(url_df['seed_word'], url_df['segmented_text_joined']):
    if seed != seed:
        # NaN seed word: the == comparison this replaces never matched it.
        continue
    text_parts_by_seed.setdefault(seed, []).append(joined_text)

concat_texts = [
    ''.join(text_parts_by_seed.get(word, []))
    for word in word_df['word']
]

if len(concat_texts) == len(word_df):
    print('LOOKS GOOD')
else:
    print('STOP! ERROR')


# str.count counts non-overlapping substring occurrences, so a noun is also
# counted where it appears inside a longer word.
tf_count = [
    [seed_text.count(noun) for noun in nouns_list]
    for seed_text in concat_texts
]


if len(concat_texts) == len(tf_count):
    print('LOOKS VERY VERY GOOD')
else:
    print('STOP! ERROR')


header_vector = ['tf_by_seed(' + noun + ')' for noun in nouns_list]

tf_df = pd.DataFrame(tf_count, columns=header_vector)
tf_df.insert(loc=0, column='seed_word', value=word_df['word'])

tf_df.to_csv('tf_by_seed_df.csv', sep=',', encoding='utf-8')


In [None]:
# CREATE DOCUMENT FREQUENCY FOR EACH NOUN (DF): number of documents that contain the noun (9)
#
# Each row of url_df is one document. The count is taken per document rather
# than over one concatenated string of all pages, because a total occurrence
# count can match across page boundaries and can exceed the number of
# documents. The IDF cell computes log((N - n + 0.5) / (n + 0.5)) with
# N = len(url_df), which is only defined while n <= N.

documents = list(url_df['segmented_text_joined'])

df_count = []
for j in range(len(nouns_list)):
    print('Processing ' + str(j) + ' out of ' + str(len(nouns_list)))
    noun = nouns_list[j]
    # Substring test, consistent with the substring counting used for the
    # term frequencies.
    df_count.append(sum(1 for document in documents if noun in document))


data = {'words': nouns_list,
        'document_frequency': df_count
        }

df_df = pd.DataFrame(data)

df_df.to_csv('df_by_all.csv', sep=',', encoding='utf-8')


In [None]:
# CREATE IDF (10)
#
# Adds an 'idf' column to df_by_all.csv:
#     idf = log10((N - n + 0.5) / (n + 0.5))
# where N is the number of documents and n the noun's document frequency.

# keep_default_na=False: otherwise pandas reads nouns spelled like "null",
# "NaN" or "None" as missing values, and they would be lost when the file is
# written back below.
df_df = pd.read_csv('df_by_all.csv', keep_default_na=False)
df_df = df_df.drop(columns=['Unnamed: 0'])

# get number of documents (25281)
N_value = len(url_df)

# get idf
idf_list = []
for i in range(len(df_df)):
    print('Processing ' + str(i) + ' out of ' + str(len(df_df)))
    n_value = df_df['document_frequency'][i]
    if n_value > N_value:
        # The log argument would be <= 0 here; math.log raises ValueError
        # ("math domain error") in that case, so raise the same exception type
        # with a message that names the cause.
        raise ValueError(
            'document_frequency ' + str(n_value) + ' in row ' + str(i)
            + ' exceeds the number of documents ' + str(N_value)
        )
    x = (N_value - n_value + 0.5)/(n_value + 0.5)
    # append() instead of `list = list + [x]`, which is quadratic.
    idf_list.append(math.log(x, 10))

df_df['idf'] = idf_list
df_df.to_csv('df_by_all.csv', sep=',', encoding='utf-8')



In [None]:
# CREATE TF-IDF DATAFRAME (11)
#
# Multiplies each seed word's term frequencies (tf_by_seed_df.csv) by the
# per-noun idf (df_by_all.csv) and writes one row per seed word and one column
# per noun to tf_idf_by_seed_df.csv.

tf_by_seed_df = pd.read_csv('tf_by_seed_df.csv')
tf_by_seed_df = tf_by_seed_df.drop(columns=['Unnamed: 0'])

# keep_default_na=False: otherwise nouns spelled like "null", "NaN" or "None"
# are read as missing values and can no longer be looked up by name.
df_df = pd.read_csv('df_by_all.csv', keep_default_na=False)
df_df = df_df.drop(columns=['Unnamed: 0'])

word_df = pd.read_csv('word_df.csv')
word_df = word_df.drop(columns=['Unnamed: 0'])

# `with` closes the file even if reading or parsing raises.
with open("nouns_list_all.txt", "r") as txt_file:
    file_content = txt_file.read()
nouns_list = ast.literal_eval(file_content)

# Build the noun -> idf lookup once instead of filtering df_df with a boolean
# mask for every (seed word, noun) pair. setdefault keeps the first row for a
# noun, matching the previous "first match" lookup.
idf_by_noun = {}
for word, idf in zip(df_df['words'], df_df['idf']):
    idf_by_noun.setdefault(word, idf)

tf_headers = ['tf_by_seed(' + noun + ')' for noun in nouns_list]
idf_values = [idf_by_noun[noun] for noun in nouns_list]

tf_idf_list = []
for i in range(len(word_df['word'])):
    print('Processing ' + str(i) + ' out of ' + str(len(word_df)))
    # All term frequencies of seed word i, in nouns_list order.
    tf_row = tf_by_seed_df.loc[i, tf_headers]
    tf_idf_list.append([tf * idf for tf, idf in zip(tf_row, idf_values)])

tf_idf_df = pd.DataFrame(tf_idf_list, columns=nouns_list)
tf_idf_df.insert(loc=0, column='seed_word', value=word_df['word'])

tf_idf_df.to_csv('tf_idf_by_seed_df.csv', sep=',', encoding='utf-8')
