In [1]:
import os
import math
import json
import itertools
import pandas as pd
import collections
import concurrent.futures as cf

In [3]:
blog_path = "../blogs/"
path_list = os.listdir(path=blog_path)
BLOG_SAMPLE_SIZE = 40
MAX_SEEN_VALUE = .50
MIN_SEEN_VALUE = .01
MATCH_LENIENCY = .50

In [4]:
def strip_blogs(sample_size, blog_path, paths):
    blogs = []
    for ind in range(sample_size):
        file = open(blog_path+paths[ind], "rb")
        posts = [line.decode("utf-8", errors="ignore").strip() for line in file]
        posts = [line for line in posts if (line != "" and line[0] != "<")]
        blogs.append(" ".join(posts))
        file.close()
    return blogs

In [5]:
# splits up a string into a list of words.
# str -> [str]
def get_words(str):
    words = []
    str = str+"."
    last_ind = 0
    for ind in range(len(str)):
        if not (str[ind].isalpha()):
            if last_ind == ind:
                last_ind += 1
                continue
            else:
                words.append(str[last_ind:ind].lower())
                last_ind = ind+1
    return words

In [6]:
def build_people_and_find_words(blogs):
    people = [dict(collections.Counter(blog)) for blog in blogs]
    flattened_blogs = [word for blog in blogs for word in blog]
    all_words_seen = dict(collections.Counter(flattened_blogs))
    return remove_useless_words(people, all_words_seen)

In [7]:
def remove_useless_words(people, all_words_seen):
    total_people = len(people)
    words_to_remove = [word for word in all_words_seen if ((all_words_seen[word]/total_people > MAX_SEEN_VALUE) or (all_words_seen[word]/total_people < MIN_SEEN_VALUE))]
    for word in words_to_remove:
        del all_words_seen[word]
        for person in people:
                if word in person:
                    del person[word]

    for word in [term for term in all_words_seen if term not in words_to_remove]:
        for person in [d for d in people if word not in d ]:
            person[word] = 0
    return (people, all_words_seen)

In [8]:
def term_frequency(person):
    return .5 + (.5*person/person.max())

In [9]:
# cos_dist = sum(ser1*ser2)/(sqrt(sum(ser1^2)) * sqrt(sum(ser2^2)))
def cos_dist(ser1, ser2, tdm, square_sums):
    numerator = tdm[ser1].dot(tdm[ser2])
    ser1_denominator = square_sums[ser1]
    ser2_denominator = square_sums[ser2]
    return 1 - numerator/(ser1_denominator*ser2_denominator)

In [10]:
def find_sums_for_each_person(tdm):
    square_sums = {person:math.sqrt(sum(map((lambda x: x**2), tdm[person]))) for person in tdm}
    return square_sums

def find_IDFs(blogs):
    blogs = list(map(set, blogs))
    blogs = list(map(collections.Counter, blogs))
    c = collections.Counter()
    for blog in blogs:
        c += blog
    return {word:math.log(BLOG_SAMPLE_SIZE/c[word]) for word in c}

In [11]:
def build_tdm(people, IDF_Dict, all_words_seen):
    tdm = pd.DataFrame({path_list[ind]:people[ind] for ind in range(BLOG_SAMPLE_SIZE) }, dtype=float)

    # (This step scales really badly)
    # IDF_Dict = {}
    # for word in all_words_seen:
    #     personCount = len([person for person in tdm if tdm[person][word] > 0])
    #     if personCount > 0:
    #         IDF_Dict[word] = math.log(BLOG_SAMPLE_SIZE / personCount)
    

    # for person in tdm:
    #     tdm[person] = term_frequency(tdm[person])
    #     for i in range(len(person)):
    #         tdm[person][i] = tdm[person][i] * IDF_Dict[tdm[person].keys()[i]]
    return tdm

In [None]:
def parse_occupation(path_name):
    return path_name.split(".")[3]

def find_unique_occupations(path_list):
    uniques = set([parse_occupation(path) for path in path_list])
    return dict(zip(uniques, range(len(uniques))))

def dict_add_person(nodes_and_links, occupations, person):
        occupation = occupations[person.name.split(".")[3]]
        nodes_and_links["nodes"].append({"id": person.name, "group":occupation})
        for relation in person.keys():
            if (person[relation] < MATCH_LENIENCY) and (relation != person.name) : 
                nodes_and_links["links"].append({"source":person.name, "target":relation, "value":1})
                
def output_to_json(write_path, path_list, similarity_frame):
    nodes_and_links = {}
    nodes_and_links["nodes"] = []
    nodes_and_links["links"] = []

    occupations = find_unique_occupations(path_list)

    for person in similarity_frame:
        dict_add_person(nodes_and_links, occupations, similarity_frame[person])

    file = open(write_path, "w")
    file.write(json.dumps(nodes_and_links, sort_keys=True, indent=2))
    file.close()

In [14]:

blogs = strip_blogs(BLOG_SAMPLE_SIZE, blog_path, path_list)
pool = cf.ProcessPoolExecutor()
blogs = list(pool.map(get_words, blogs))
(people, all_words_seen) = build_people_and_find_words(blogs)
IDFs = find_IDFs(blogs)
tdm = build_tdm(people, IDFs, all_words_seen)
# (This step scales really badly)
square_sums = find_sums_for_each_person(tdm)
similarity_frame = pd.DataFrame({person:{relation:cos_dist(person, relation, tdm, square_sums) for relation in tdm} for person in tdm})
#     print(similarity_frame)
#     output_to_json("./writeTest.json", path_list, similarity_frame)

In [42]:
(similarity_frame < .5).all()

Unnamed: 0,1000331.female.37.indUnk.Leo.xml,1004904.male.23.Arts.Capricorn.xml,1005076.female.25.Arts.Cancer.xml,1005545.male.25.Engineering.Sagittarius.xml,1007188.male.48.Religion.Libra.xml,100812.female.26.Architecture.Aries.xml,1008329.female.16.Student.Pisces.xml,1009572.male.25.indUnk.Cancer.xml,1011153.female.27.Technology.Virgo.xml,1011289.female.25.indUnk.Libra.xml,...,1031806.male.17.Technology.Sagittarius.xml,1032153.male.27.Technology.Pisces.xml,1032591.female.24.Banking.Aquarius.xml,1032824.female.15.Student.Libra.xml,1034874.female.43.Publishing.Capricorn.xml,1039136.male.24.Student.Capricorn.xml,1039908.female.16.indUnk.Gemini.xml,1040084.male.17.indUnk.Taurus.xml,1042993.male.15.Student.Sagittarius.xml,1043329.male.23.Government.Pisces.xml
1000331.female.37.indUnk.Leo.xml,0.0,1.0,0.997064,0.982813,0.9968444,0.976605,0.9919984,0.9946085,0.994358,0.9739,...,0.9834353,0.975932,0.993079,0.980395,0.988119,0.9890502,0.991731,0.9811245,0.9861162,0.9910778
1004904.male.23.Arts.Capricorn.xml,1.0,-2.220446e-16,0.968845,0.9778086,0.9962799,0.981613,0.9666699,0.9978813,0.9955658,0.934484,...,0.9484258,0.957928,0.980282,0.960949,0.974321,0.9636644,0.984836,0.9684759,0.9867501,0.9740546
1005076.female.25.Arts.Cancer.xml,0.997064,0.9688447,0.0,0.9825491,0.9980912,0.987617,0.9651513,0.9934774,0.9863488,0.950864,...,0.9757211,0.977409,0.970171,0.974442,0.986824,0.9694585,0.989163,0.9785919,0.9934014,0.9875869
1005545.male.25.Engineering.Sagittarius.xml,0.982813,0.9778086,0.982549,1.110223e-16,0.9377539,0.960803,0.9745033,0.9775027,0.987872,0.895805,...,0.8987364,0.901982,0.970245,0.952985,0.944414,0.9372331,0.975255,0.9665855,0.9520984,0.9323088
1007188.male.48.Religion.Libra.xml,0.996844,0.9962799,0.998091,0.9377539,-2.220446e-16,0.977188,0.9901173,0.9929905,0.9981662,0.97178,...,0.9712167,0.978959,0.994939,0.986157,0.987128,0.983787,0.983875,0.9946319,0.9697014,0.9837603
100812.female.26.Architecture.Aries.xml,0.976605,0.981613,0.987617,0.9608033,0.977188,0.0,0.9843339,0.9902561,0.9847051,0.944086,...,0.9453075,0.949377,0.984365,0.950213,0.973756,0.9708658,0.972603,0.9747705,0.9901425,0.9725877
1008329.female.16.Student.Pisces.xml,0.991998,0.9666699,0.965151,0.9745033,0.9901173,0.984334,-2.220446e-16,0.9920018,0.9953501,0.926073,...,0.9376211,0.963474,0.958931,0.943393,0.984006,0.9532738,0.960929,0.9646091,0.9918268,0.9749987
1009572.male.25.indUnk.Cancer.xml,0.994609,0.9978813,0.993477,0.9775027,0.9929905,0.990256,0.9920018,2.220446e-16,0.9780682,0.97042,...,0.9603746,0.968199,0.992313,0.986485,0.983505,0.9797311,0.995408,0.9855873,0.9966957,0.9762174
1011153.female.27.Technology.Virgo.xml,0.994358,0.9955658,0.986349,0.987872,0.9981662,0.984705,0.9953501,0.9780682,-2.220446e-16,0.958212,...,0.9781561,0.972509,0.990951,0.98625,0.976985,0.9780824,0.99039,0.9821754,0.9965422,0.9875562
1011289.female.25.indUnk.Libra.xml,0.9739,0.9344839,0.950864,0.8958051,0.9717803,0.944086,0.9260733,0.9704203,0.9582121,0.0,...,0.8490693,0.859801,0.93431,0.896484,0.90494,0.8813863,0.931045,0.9036912,0.9699667,0.9235392


In [53]:
less_than = similarity_frame < .5 
greater_than = similarity_frame >.00000001

In [56]:
table = less_than.eq(greater_than)
table.any()

1000331.female.37.indUnk.Leo.xml                  False
1004904.male.23.Arts.Capricorn.xml                False
1005076.female.25.Arts.Cancer.xml                 False
1005545.male.25.Engineering.Sagittarius.xml       False
1007188.male.48.Religion.Libra.xml                False
100812.female.26.Architecture.Aries.xml           False
1008329.female.16.Student.Pisces.xml              False
1009572.male.25.indUnk.Cancer.xml                 False
1011153.female.27.Technology.Virgo.xml            False
1011289.female.25.indUnk.Libra.xml                False
1011311.female.17.indUnk.Scorpio.xml              False
1013637.male.17.RealEstate.Virgo.xml              False
1015252.female.23.indUnk.Pisces.xml               False
1015556.male.34.Technology.Virgo.xml              False
1016560.male.41.Publishing.Sagittarius.xml        False
1016738.male.26.Publishing.Libra.xml              False
1016787.female.24.Communications-Media.Leo.xml    False
1019224.female.27.RealEstate.Libra.xml          