In [2]:
# 2020-01 CAU
# Natural Language Processing and Information Retrieval
# Term Indexing Project (Boolean model)
#
# Jihyo Han 20161856
#
#

import os
import re
import numpy as np
import matplotlib.pyplot as plt

# for stop words
from nltk.corpus import stopwords 
import string

# for saving array as .xls file
import xlsxwriter

import copy

In [3]:
stop_words = stopwords.words('english') + ['th', 'nd', 'i\'m', 'can\'t', 'i\'ve', '']
stop_marks = list(string.punctuation) + ['»','¿','»','â','€','œ','£','ã','¢','˜'] \
              + list(('0','1','2','3','4','5','6','7','8','9'))
print(stop_words, stop_marks)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
def countword(dir, word_count):
    for filename in os.listdir(dir):
        if filename.endswith('.txt'):
            print(filename)
            # ----- reading file ----- #
            f = open(dir+filename, 'r', encoding='UTF8')
            lines = f.read()           # full script text
            lines = lines.lower()      # to check stop_words regardless of case
            f.close()
            # ------------------------ #

            tokens = lines.split() # split script by blank

            # ----- removing stop words ----- #
            for token in tokens:
                if token in stop_words:
                    continue
                else: 
                    token = token.translate({ord(words): '' for words in stop_marks}) # removing stop_marks
                    if token in (stop_words or 'abcdefghijklmnopqrstuvwxyz'):
                        # to double-check after eliminating stop_marks
                        continue
                    else:
                        # ------ counting word frequency
                        new = 1 # new word toggle
                        for idx_word in range(len(word_count)):
                            if token == word_count[idx_word][0]:    # repeat word just counts up
                                word_count[idx_word][1] += 1
                                new = 0
                                break
                        if new == 1: word_count.append([token, 1])  # if the token is new, add to list
            # ------------------------------- #

        else:
            print('!!! --- This is not a .txt file --- !!! \n')

In [23]:
# Put dir where scripts are in
female_dir = '../scripts/Female/'
male_dir = '../scripts/Male/'
female_word_count = []  # Checking word distribution of all movie scripts in folder
male_word_count = []

print('---------- Female Movies START')
countword(female_dir, female_word_count)
female_word_count.sort()
print('---------- Male Movies START')
countword(male_dir, male_word_count)
male_word_count.sort()

---------- Female Movies START
10-Things-I-Hate-About-You.txt
A QUIET PLACE.txt
Beauty-and-the-Beast.txt
Black-Swan.txt
Burlesque.txt
Carrie.txt
Case-39.txt
CHARLE'S ANGELS.txt
Easy-A.txt
Ex-Machina.txt
frozen.txt
Girl-with-the-Dragon-Tattoo,-The.txt
Gravity.txt
Hanna.txt
Heathers.txt
Hollow-Man.txt
I-Spit-on-Your-Grave.txt
INGLOURIOUS BASTERDS.txt
Jackie-brown.txt
Jane-Eyre.txt
Juno.txt
Kill-Bill-Volume-1-&-2.txt
Labyrinth.txt
Legally-Blonde.txt
Little-Mermaid,-The.txt
Mary-Poppins.txt
Mulan.txt
Never-Been-Kissed.txt
Panic-Room.txt
Piano,-The.txt
Prometheus.txt
Proposal,-The.txt
RESIDENT EVIL.txt
Resident-Evil.txt
Roommate,-The.txt
Saving-Mr.-Banks.txt
Scream.txt
Sense-and-Sensibility.txt
Sicario.txt
Silence-of-the-Lambs.txt
Single-White-Female.txt
Sister-Act.txt
Star-Wars-The-Force-Awakens.txt
Terminator.txt
THE DEVIL WEARS PRADA.txt
Thelma-&-Louise.txt
Wanted.txt
WIZARD OF OZ.txt
Wonder-Woman.txt
Zootopia.txt
---------- Male Movies START
Aladdin.txt
American-Beauty.txt
American-Snip

In [24]:
# --- Comparing total word counts between Female vs Male
print('Female word count: ', len(female_word_count))
print('Male word count: ', len(male_word_count))

#print(female_word_count)
#print(male_word_count)

Female word count:  34884
Male word count:  35441


In [25]:
# --- Saving results as .xlsx file
def savexlsx(xlsxfilename, word_count):
    with xlsxwriter.Workbook(xlsxfilename) as workbook:
        worksheet = workbook.add_worksheet()

        for row_num, data in enumerate(word_count):
            worksheet.write_row(row_num, 0, data)
    workbook.close()

savexlsx('female_word_count.xlsx', female_word_count)
savexlsx('male_word_count.xlsx', male_word_count)

In [26]:
# --- Making Inverted Index
# .. because there're only two documents, this model just counts how many documents have a word.
# (not having a posting list of which documents contain words)

# count meaning: female = 0, male = 1, both = 2

# copying male_word_count to total_word_count
total_word_count = copy.copy(male_word_count)
for idx_word in range(len(total_word_count)):
    total_word_count[idx_word][1] = 1   # make all counts to 1 (male)


# Check whether female_word_count words are duplicate
for idx_word in range(len(female_word_count)):
    new = 1
    for dup in range(len(total_word_count)): 
        if total_word_count[dup][0] == female_word_count[idx_word][0]:
            total_word_count[dup][1] = 2 # If both documents have word
            new = 0
            break
    if new == 1: total_word_count.append([female_word_count[idx_word][0],0]) # If not, add to list

In [27]:
total_word_count.sort()
#print(total_word_count)

In [28]:
common = 0 
female = 0
male = 0

for idx_count in range(len(total_word_count)):
    if total_word_count[idx_count][1] == 2: common +=1
    elif total_word_count[idx_count][1] == 1: male +=1
    else: female += 1

print(' total words counts: ', len(total_word_count),\
      '\n common words: ', common,\
      '\n female only words: ', female,\
      '\n male only words: ', male
     )

 total words counts:  50743 
 common words:  19582 
 female only words:  15302 
 male only words:  15859


In [29]:
savexlsx('total_word_count.xlsx', total_word_count)