## Importing files sorted on filename ##

Here we import your .txt files and their filenames. We sort the filenames to keep the order of your index.

NB: you need to install the `natsort` module (https://anaconda.org/anaconda/natsort),
    which you can do either from the Anaconda Prompt or from the Anaconda Navigator.
    If you use the prompt, write: `conda install natsort`

In [None]:
import os, glob
from natsort import natsorted

def read_txt_dir(dirpath):
    """Read every ``.txt`` file located directly inside ``dirpath``.

    The matching paths are ordered with ``natsorted`` before reading, so the
    returned contents and names are aligned and follow that order.

    Parameters
    ----------
    dirpath : str
        Directory to search. Only the ``*.txt`` pattern is globbed, so
        sub-directories are not traversed.

    Returns
    -------
    files : list of str
        Full text content of each file.
    filenames : list of str
        Name of each file with the directory part removed, aligned with
        ``files``.
    """
    paths = natsorted(glob.glob(os.path.join(dirpath, "*.txt")))
    files = []
    for path in paths:
        with open(path, "r") as fobj:
            files.append(fobj.read())
    # os.path.basename honours the platform's path separator; splitting on "/"
    # left the directory prefix in place for backslash-separated paths
    filenames = [os.path.basename(path) for path in paths]
    return files, filenames

# both corpora live below "dat"; build the two directory paths first
article_path, magazine_path = (
    os.path.join("dat", corpus) for corpus in ("articles", "magazines")
)

# articles: texts and their file names, in matching order
articles, article_names = read_txt_dir(article_path)

# magazines: texts and their file names, in matching order
magazines, magazine_names = read_txt_dir(magazine_path)

## Tokenization and basic corpus statistics ##

Start by computing basic corpus statistics. For this you need several functions for preprocessing your string data. We use `re` to remove punctuation and `NLTK` for tokenization. The functionality can be implemented with `re` alone.

In [None]:
######################################## TOKENIZE ###########################################

import re

# separator pattern: every run of characters that is not an ASCII letter.
# Compiled once here instead of on every tokenize() call.
_NON_LETTER_RUN = re.compile(r'[^A-Za-z]+')


def tokenize(input, length = 0, casefold = True):
    """Split a string into tokens made of ASCII letters only.

    Every run of characters outside ``A-Z`` / ``a-z`` (digits, punctuation,
    whitespace, non-ASCII letters) acts as a separator and is discarded.

    Parameters
    ----------
    input : str
        Text to tokenize.
    length : int, optional
        Only tokens strictly longer than ``length`` are kept. The default of
        0 keeps every non-empty token; ``length = 3`` drops tokens of three
        characters or fewer.
    casefold : bool, optional
        Lowercase the text before splitting (default True).

    Returns
    -------
    list of str
        Tokens in order of appearance.
    """
    if casefold:
        input = input.lower()
    return [token for token in _NON_LETTER_RUN.split(input) if len(token) > length]

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Exactly one level of nesting is removed; the order of the items is kept.
    """
    return [item for sublist in l for item in sublist]

# one token list per article
articles_tokens = [tokenize(article) for article in articles]

# every token of the corpus in a single flat list, sorted in place
tokens = flatten(articles_tokens)
tokens.sort()

n_tokens = len(tokens)          # total number of tokens
n_types = len(set(tokens))      # number of distinct tokens

print("The corpus consist of {} tokens distributed over {} lexical types".format(n_tokens, n_types))
# type-token ratio: distinct tokens per running token
type_token_ratio = round(n_types/n_tokens,4)
print("The lexical richness measured as the type-token ratio is {}".format(type_token_ratio))
# inverse ratio: running tokens per distinct token
mean_repetitions = round(n_tokens/n_types,2)
print("On average every word is repeated {} times".format(mean_repetitions))

## Stopwords ##

In [None]:
from collections import defaultdict
from operator import itemgetter

def gen_ls_stoplist(input, n = 100):
    """Return the ``n`` most frequent tokens of a tokenized dataset.

    ``input`` is an iterable of token sequences (one per text). Tokens are
    counted over all texts and returned from most to least frequent; tokens
    with equal counts keep the order in which they were first seen.
    """
    counts = defaultdict(int)
    for token in (tok for text in input for tok in text):
        counts[token] += 1
    # a stable descending sort of the keys by their count
    ranked = sorted(counts, key = counts.get, reverse = True)
    return ranked[:n]

# corpus-specific stopword list: the 50 most frequent tokens over all articles
sw = gen_ls_stoplist(articles_tokens, n = 50)

import io

def read_txt(filepath):
    """Return the whole content of the UTF-8 text file at ``filepath`` as one string."""
    # the with-block closes the file even when read() raises
    with io.open(filepath, 'r', encoding = 'utf-8') as f:
        return f.read()

# NLTK stopword list: read from file, then run through the same tokenizer as
# the articles so that it ends up as a list of tokens.
# NOTE: an earlier version read 'Stopwords/english.txt' - adjust the path if
# your copy of the file carries the extension.
nltksw = tokenize(read_txt('Stopwords/english'))

# `sw` and `nltksw` are lists, so `token not in sw` rescans the list for every
# token. Sets built once give the same answers with constant-time lookups;
# the lists themselves are left untouched.
sw_lookup = set(sw)
nltksw_lookup = set(nltksw)

# apply sw
no_sw = [[token for token in article if token not in sw_lookup]
         for article in articles_tokens]

# apply nltksw
no_nltksw = [[token for token in article if token not in nltksw_lookup]
             for article in articles_tokens]

# apply sw + nltksw (nltksw is applied on top of the sw-filtered articles)
no_swall = [[token for token in article if token not in nltksw_lookup]
            for article in no_sw]

## Filter tokens of 3 characters or fewer ##

In [None]:
################################### REMOVE 3 CHAR TOKENS #######################################


def _longer_than_three(token_lists):
    """Copy each token list, keeping only the tokens with more than 3 characters."""
    return [[token for token in token_list if len(token) > 3]
            for token_list in token_lists]


# drop tokens of three characters or fewer from the articles without sw
clean_nosw = _longer_than_three(no_sw)

# ... from the articles without nltksw
clean_nonltksw = _longer_than_three(no_nltksw)

# ... from the articles without sw + nltksw
clean_noswall = _longer_than_three(no_swall)


# Stemming

In [None]:
######################################### STEMMING  #############################################

from nltk.stem import PorterStemmer

# one shared stemmer instance for the whole notebook
ps = PorterStemmer()


def _stem_token_lists(token_lists):
    """Return a copy of each token list with every token replaced by ``ps.stem(token)``."""
    return [[ps.stem(token) for token in token_list]
            for token_list in token_lists]


# stemmed articles without sw
stem_nosw = _stem_token_lists(clean_nosw)

# stemmed articles without nltksw
stem_nonltksw = _stem_token_lists(clean_nonltksw)

# stemmed articles without sw + nltksw
stem_noswall = _stem_token_lists(clean_noswall)

## New corpus statistics

In [None]:
################################### NEW CORPUS STATISTICS ########################################


# all stemmed, stopword-free tokens in one flat list, sorted in place
clean_tokens = flatten(stem_noswall)
clean_tokens.sort()

n_clean_tokens = len(clean_tokens)          # total number of tokens
n_clean_types = len(set(clean_tokens))      # number of distinct tokens


print("The corpus now consist of {} tokens distributed over {} lexical types".format(n_clean_tokens, n_clean_types))
# type-token ratio after cleaning
clean_type_token_ratio = round(n_clean_types/n_clean_tokens,4)
print("The new lexical richness measured as the type-token ratio is now {}".format(clean_type_token_ratio))
# running tokens per distinct token after cleaning
clean_mean_repetitions = round(n_clean_tokens/n_clean_types,2)
print("On average every word is now repeated {} times".format(clean_mean_repetitions))



# Word frequency

In [None]:
from collections import Counter

# token counts over the raw tokenized articles
wf_all = Counter(token for article in articles_tokens for token in article)

# token counts over the stemmed articles without sw + nltksw
wf_stem_noswall = Counter(token for article in stem_noswall for token in article)

# Topic modeling

In [None]:
# import pos_tag to tag parts of speech
from nltk.tag import pos_tag
# tag the flat corpus token list with the universal tagset
# NOTE(review): `tokens` was sorted alphabetically when it was built, so the
# tagger presumably gets no usable word context here, and `tag_all_tokens` is
# not used again in the visible part of the notebook - confirm it is needed.
tag_all_tokens = pos_tag(tokens, tagset = 'universal', lang = 'eng')

# per article: keep only the tokens tagged as nouns
article_nouns = []
for article in articles:
    # original casing is kept for tagging; tokens of <= 3 characters are dropped
    cased_tokens = tokenize(article, length = 3, casefold = False)
    tagged = pos_tag(cased_tokens, tagset = 'universal', lang = 'eng')
    # lowercase only after the nouns have been selected
    article_nouns.append([word.lower() for word, tag in tagged if tag == 'NOUN'])

# STOPWORDS

# `sw` and `nltksw` are lists, so `token not in sw` rescans the list for every
# token. Sets built once give the same answers with constant-time lookups.
sw_lookup = set(sw)
nltksw_lookup = set(nltksw)

# apply sw
nouns_no_sw = [[token for token in article if token not in sw_lookup]
               for article in article_nouns]

# apply sw + nltksw (nltksw is applied on top of the sw-filtered nouns)
nouns_no_swall = [[token for token in article if token not in nltksw_lookup]
                  for article in nouns_no_sw]

# STEMMING

# stemming nouns_no_swall
stem_nouns_noswall = [[ps.stem(token) for token in article]
                      for article in nouns_no_swall]


# B-O-W    
        
from gensim import corpora

# gensim dictionary built from the stemmed articles (all word classes)
dic_stem_noswall = corpora.Dictionary(stem_noswall)
# bag-of-words representation of every article, produced by that dictionary
bow_stem_noswall = list(map(dic_stem_noswall.doc2bow, stem_noswall))


# gensim dictionary built from the stemmed noun-only articles
dic_nouns_noswall = corpora.Dictionary(stem_nouns_noswall)
# bag-of-words representation of every noun-only article
bow_nouns_noswall = list(map(dic_nouns_noswall.doc2bow, stem_nouns_noswall))


from gensim import models

def _print_topics(mdl, num_topics, topn = 15):
    """Print, for each topic id below ``num_topics``, its first ``topn`` terms from ``mdl.show_topic``."""
    for topic_id in range(num_topics):
        print('Topic',topic_id)
        print([term for term, _ in mdl.show_topic(topic_id,topn)])
        print('-----')


# number of topics, shared by both models
k = 50

# LDA model on the bag of words of the stemmed articles
mdl_noswall = models.LdaModel(
    bow_stem_noswall,
    id2word = dic_stem_noswall,
    num_topics = k,
    random_state = 1234,
    iterations = 100,
    passes = 100,
)
_print_topics(mdl_noswall, k)

# LDA model on the bag of words of the noun-only articles, same settings
mdl_nouns_noswall = models.LdaModel(
    bow_nouns_noswall,
    id2word = dic_nouns_noswall,
    num_topics = k,
    random_state = 1234,
    iterations = 100,
    passes = 100,
)
_print_topics(mdl_nouns_noswall, k)


# EXPLORE THE MODELS

import pandas as pd

def get_theta(doc_bow, mdl):
    """Return the topic weights ``mdl`` assigns to one bag-of-words document.

    Calls ``mdl.get_document_topics`` with ``minimum_probability=0`` and keeps
    the second element of every returned pair, in the order returned.
    """
    doc_topics = mdl.get_document_topics(doc_bow, minimum_probability=0)
    return [weight for _, weight in doc_topics]

# Document-topic tables: one row per article, one column per topic.
#
# Every article goes through get_theta exactly once and its whole topic
# vector is kept. Running get_theta again for each (topic, article) pair would
# cost k times as many inference calls and would assemble each row from k
# separate runs, which need not agree with each other if inference is not
# fully deterministic.
topic_names = ['topic %d' % topicnr for topicnr in range(k)]

# pair topics and articles for stem_noswall
theta_stem_noswall = [get_theta(doc_bow, mdl_noswall) for doc_bow in bow_stem_noswall]
topics_stem_noswall = pd.DataFrame(
    {name: [theta[topicnr] for theta in theta_stem_noswall]
     for topicnr, name in enumerate(topic_names)},
    columns = topic_names)      # explicit column order: topic 0 .. topic k-1

# pair topics and articles for nouns_noswall
theta_nouns_noswall = [get_theta(doc_bow, mdl_nouns_noswall) for doc_bow in bow_nouns_noswall]
topics_nouns_noswall = pd.DataFrame(
    {name: [theta[topicnr] for theta in theta_nouns_noswall]
     for topicnr, name in enumerate(topic_names)},
    columns = topic_names)
    



# Association rules

In [None]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display

def size(obj):
    """Format ``sys.getsizeof(obj)`` in megabytes (1 MB = 1000 * 1000 bytes) with two decimals."""
    bytes_per_mb = 1000 * 1000
    megabytes = sys.getsizeof(obj) / bytes_per_mb
    return "{0:.2f} MB".format(megabytes)

def freq(iterable):
    """Count how often each distinct value occurs and return the counts as a Series named "freq".

    A pandas Series (or a subclass of it) is counted with ``value_counts``.
    Any other iterable - list, Index, generator of item pairs - is counted
    with ``collections.Counter`` and wrapped in a Series.
    """
    # isinstance also accepts Series subclasses, unlike an exact type comparison
    if isinstance(iterable, pd.Series):
        return iterable.value_counts().rename("freq")
    return pd.Series(Counter(iterable)).rename("freq")

def order_count(order_item):
    """Return the number of distinct index labels of ``order_item`` (one label per order)."""
    distinct_orders = set(order_item.index)
    return len(distinct_orders)

def get_item_pairs(order_item):
    """Yield, one at a time, every pair of items that belong to the same order.

    ``order_item`` is a Series whose index holds the order id and whose values
    hold the items. Rows are grouped with ``itertools.groupby``, which only
    merges adjacent rows, so all rows of one order must be contiguous.
    For each order the 2-combinations of its items are yielded in row order.
    """
    # rows of (order id, item); .values replaces DataFrame.as_matrix(), which
    # no longer exists in pandas 1.0 and later
    rows = order_item.reset_index().values
    for order_id, order_rows in groupby(rows, lambda row: row[0]):
        item_list = [row[1] for row in order_rows]

        for item_pair in combinations(item_list, 2):
            yield item_pair

def merge_item_stats(item_pairs, item_stats):
    """Attach the per-item frequency and support to both members of every pair.

    ``item_stats`` (indexed by item, columns ``freq`` and ``support``) is
    joined twice on its index: once against ``item_A`` (giving ``freqA`` /
    ``supportA``) and once against ``item_B`` (giving ``freqB`` / ``supportB``).
    """
    stats_a = item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'})
    stats_b = item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'})
    with_a = item_pairs.merge(stats_a, left_on='item_A', right_index=True)
    return with_a.merge(stats_b, left_on='item_B', right_index=True)

def merge_item_name(rules, item_name):
    """Replace the item ids of a rules table by item names.

    ``item_name`` (columns ``item_id`` and ``item_name``) is joined twice on
    ``item_id``: against ``item_A`` to produce ``itemA`` and against ``item_B``
    to produce ``itemB``. Only the name and metric columns are returned.
    """
    names_a = item_name.rename(columns={'item_name': 'itemA'})
    names_b = item_name.rename(columns={'item_name': 'itemB'})
    named = rules.merge(names_a, left_on='item_A', right_on='item_id')
    named = named.merge(names_b, left_on='item_B', right_on='item_id')
    # final column selection and order
    return named[['itemA','itemB','freqAB','supportAB','freqA','supportA','freqB','supportB',
                  'confidenceAtoB','confidenceBtoA','lift']]
    


def association_rules(order_item, min_support):
    """Mine pairwise association rules from order/item data.

    Parameters
    ----------
    order_item : pandas.Series
        Index holds the order id, values hold the item of that order line.
    min_support : float
        Minimum support in percent of orders; applied first to single items
        and then to item pairs.

    Returns
    -------
    pandas.DataFrame
        One row per qualifying item pair with pair and item frequencies,
        supports (in percent), both confidences and the lift, sorted by lift
        in descending order.

    Progress figures are printed after each filtering step.
    """

    print("Starting order_item: {:22d}".format(len(order_item)))


    # Calculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Filter from order_item items below min support 
    qualifying_items       = item_stats[item_stats['support'] >= min_support].index
    order_item             = order_item[order_item.isin(qualifying_items)]

    print("Items with support >= {}: {:15d}".format(min_support, len(qualifying_items)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Filter from order_item orders with less than 2 items
    order_size             = freq(order_item.index)
    qualifying_orders      = order_size[order_size >= 2].index
    order_item             = order_item[order_item.index.isin(qualifying_orders)]

    print("Remaining orders with 2+ items: {:11d}".format(len(qualifying_orders)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Recalculate item frequency and support on the reduced data
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Get item pairs generator
    item_pair_gen          = get_item_pairs(order_item)


    # Calculate item pair frequency and support
    item_pairs              = freq(item_pair_gen).to_frame("freqAB")
    item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

    print("Item pairs: {:31d}".format(len(item_pairs)))


    # Filter from item_pairs those below min support
    item_pairs              = item_pairs[item_pairs['supportAB'] >= min_support]

    print("Item pairs with support >= {}: {:10d}\n".format(min_support, len(item_pairs)))


    # Create table of association rules and compute relevant metrics
    item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    item_pairs = merge_item_stats(item_pairs, item_stats)

    # confidences are ratios of two percentages, so the factor 100 cancels
    item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
    item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
    # lift = P(A and B) / (P(A) * P(B)). The supports are percentages, so the
    # plain ratio supportAB / (supportA * supportB) is 100 times too small;
    # the factor 100 restores the true lift (independence corresponds to 1).
    item_pairs['lift']           = item_pairs['supportAB'] * 100 / (item_pairs['supportA'] * item_pairs['supportB'])


    # Return association rules sorted by lift in descending order
    return item_pairs.sort_values('lift', ascending=False)

# export the stemmed articles for processing in R: DataFrame(stem_noswall) has
# one row per article, the transpose turns every article into a column
df = pd.DataFrame(stem_noswall).transpose()
df.to_csv('stem_noswall.csv')

''' work in R to fit dataframe suitable for code'''

# import the file created in R
# NOTE(review): absolute path of one specific machine - adapt before running elsewhere
orders = pd.read_csv('/Users/emmaelisabethkiis/Desktop/eek-thesis-master/AR_stem_noswall.csv')

# shape expected by association_rules: order_id as index, product_id as values,
# series named item_id
orders = orders.set_index('order_id')['product_id'].rename('item_id')


# mine the rules; 0.2 is passed as min_support
rules = association_rules(orders, 0.2)

print(rules.head())

unique_orders = len(orders.index.unique())
unique_items = len(orders.value_counts())
print('dimensions: {0};   size: {1};   unique_orders: {2};   unique_items: {3}'.format(orders.shape, size(orders), unique_orders, unique_items))


# rules whose item_A matches 'woman' / 'man'
# NOTE(review): Series.str.match only anchors the pattern at the start of the
# string, so 'man' also selects any item_A that merely begins with "man" -
# confirm whether an exact comparison was intended
woman = rules[rules['item_A'].str.match('woman')]
man = rules[rules['item_A'].str.match('man')]



## Word associations based on Cosine distance between word vectors

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def cosine_similarity(a,b):
    """ cosine of the angle between vectors a and b:
        their dot product divided by the product of their Euclidean norms
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a)*np.linalg.norm(b)
    return dot_product/norm_product


# vector space
no_features = 1000# max number of features/words
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
TDM = vectorizer.fit_transform(articles)
# The fitted object is `vectorizer` (the name `tfidf_vectorizer` was never
# defined). Recent scikit-learn releases only offer get_feature_names_out;
# older ones only get_feature_names - use whichever is available.
if hasattr(vectorizer, "get_feature_names_out"):
    lexicon = list(vectorizer.get_feature_names_out())
else:
    lexicon = list(vectorizer.get_feature_names())

# compute distance matrix from cosine similarity
X = TDM.todense() 
n = len(lexicon)
# one flat vector per term (= column of X), extracted once up front instead of
# being rebuilt for every (i, j) combination inside the double loop
term_vectors = [np.asarray(X[:, col]).ravel() for col in range(n)]
distance_matrix = np.zeros((n,n))

for i in range(n):
    for j in range(n):
        distance_matrix[i,j] = 1 - cosine_similarity(term_vectors[i], term_vectors[j])

# write matrix to file, rows and columns labelled with the terms
cos_df = pd.DataFrame(distance_matrix, index=lexicon, columns=lexicon)
cos_df.to_csv("dat/cosine_distance.csv")

In [None]:
# example query against the stored distance matrix

## reload the matrix: first row = column labels, first column = row labels
cos_df = pd.read_csv("dat/cosine_distance.csv",header=0,index_col=0)

## the two terms to compare
q1 = "sisters"
q2 = "jihad"

## look up their distance and report it
query_distance = cos_df.loc[q1,q2]
print("cosine distance between {} and {} is {}".format(q1,q2,query_distance))