In [50]:
import pandas as pd
import collections
import numpy as np

In [2]:
def mail_df (csv):
    dfmail = pd.read_csv(csv)
    #clean up mail horoscope, which has offers in each section
    #until the scraper is sorted
    dfmail.replace({'Your Week Ahead: ':'',
                    '-':' ',
                    ''' Now, if you found this forecast helpful, then here's a chance to get more spookily accurate, personalised, written and spoken predictions sent directly to you... 100% FREE. The help you need, exactly when you need it. No commitment, no credit card details, no strings! You have nothing to lose. Just enter your email address here.''':''}, regex = True, inplace = True)
    return dfmail

In [3]:
import string
def prep_and_count(df):
    
    #preparation for removing punctuation
    punct = '!"#$%&\()*+,-./:;<=>?@[\\]^_`{}~'
    transtab = str.maketrans(dict.fromkeys(punct, ''))
    #join all horos together, remove punctuation, make lower case
    v = ''.join(df.reading[0:11].tolist()).lower()
    w = v.translate(transtab).split()
    
    #count into a dictionary
    counts = collections.Counter(w)
    
    #create a dataframe from the count dictionary
    lst = []
    for x in counts.keys():
        lst.append([x, counts[x]])  #previously I needed .decode('utf-8') but somehow using translate earlier did this on its own
    counted_frame = pd.DataFrame(lst, columns = ['Word', 'Freq'])
    
    return counted_frame

In [4]:
def comparable(counts_frame):
    
    #take the frame of word counts, tweak and twiddle into something to compare with Eng_5000_words
    counts_frame.sort_values(by = 'Freq', ascending = False, inplace = True)
    counts_frame['RankHoro'] = [x for x in range(len(counts_frame))]
    counts_frame.reset_index(inplace = True)
    counts_frame.drop('index', axis = 1, inplace = True)
    
    return counts_frame

In [5]:
def eng_5000_base():
    eng_5000 = pd.read_csv('Eng_5000_words2.csv')
    
    #remove fussy parts-of-speech distinctions, sort and add new rankings
    #could just export a csv to refer to and remove this code
    tidy = eng_5000.groupby('Word').sum()
    tidy.sort_values('Frequency', ascending = False, inplace = True)
    tidy['Rank5000'] = [x for x in range(len(tidy))]
    
    return tidy

The mission will be to compare frequency of words by ranking, in each horoscope, with the top 5000 Eng words.

**Which words come up more often in horos than in normal language?**

In [6]:
eng_5000 = eng_5000_base()

In [7]:
def compare_with_5000(comparable_df):    
    #merge horo word count with normal word count
    compare = pd.merge(comparable_df, eng_5000, right_on = 'Word', left_on = 'Word', how = 'inner')
    compare.drop(['Rank', 'Frequency', 'Freq'], axis = 1, inplace = True)

    #compare
    more_common = compare[compare.RankHoro < compare.Rank5000].copy()#needs .copy to show it's not a slice
    more_common['More'] = compare.Rank5000 - compare.RankHoro
    more_common.sort_values('More', ascending = False, inplace = True)
    return more_common

In [8]:
def just_gimme_a_horo(horo_csv):
    #access scraped horo
    df = pd.read_csv(horo_csv)
    df.drop([0], axis = 0, inplace = True) #until the scraper is sorted
    
    #prep and count
    df_counted =  prep_and_count(df)
    
    #prep and compare
    df_comp = comparable(df_counted)
    df_more_common = compare_with_5000(df_comp)
    
    return df_more_common

In [9]:
dfmirror = just_gimme_a_horo ('mirror_horo_2019-02-09.csv')

In [10]:
dfsun = just_gimme_a_horo ('sun_horo_2019-02-09.csv')

In [11]:
dfyahoo = just_gimme_a_horo ('yahoo_horo_2019-02-09.csv')

In [12]:
dfmail = just_gimme_a_horo ('mail_horo_2019-02-09.csv')

That is interesting so far: words like romance, delight, emotionally, ambition all come up ranked higher. But if a word only comes up once in the horoscope, it ranks higher by default - there are fewer than 1000 words so far in the horoscopes. So we could:

**merge all the horoscopes**   or   **only consider words that appear at least twice, or maybe even more often**.

Other aims of the project were to see how much the different horoscope sources agree with each other, eg **across all sources, is the outlook for Aries the same?**

and to see how different the horoscopes are for different star signs - **are they basically saying much the same thing regardless of star sign?**

In [61]:
#merge them all
def horos_to_merge(horo_csv):
    #access scraped horo
    df = pd.read_csv(horo_csv)
    df.drop([0], axis = 0, inplace = True) #until the scraper is sorted
    
    #prep and count
    df_counted =  prep_and_count(df)
    
    #prep and compare
    df_comp = comparable(df_counted)
    
    return df_comp

def mail_horo_to_merge(horo_csv):
    #access scraped horo
    df = mail_df (horo_csv)
    df.drop([0], axis = 0, inplace = True) #until the scraper is sorted
    
    #prep and count
    df_counted =  prep_and_count(df)
    
    #prep and compare
    df_comp = comparable(df_counted)
    
    return df_comp 

In [80]:
dfmirror1 = horos_to_merge ('mirror_horo_2019-02-09.csv')
dfsun1 = horos_to_merge ('sun_horo_2019-02-09.csv')
dfyahoo1 = horos_to_merge ('yahoo_horo_2019-02-09.csv')
dfmail1 = mail_horo_to_merge ('mail_horo_2019-02-09.csv')


def rename (df, source):
    df.rename({'Freq' : 'Freq' + str(source)}, axis = 1, inplace = True)
    df.drop(['RankHoro'], axis=1, inplace = True)
    return df
dfmirror1 = rename(dfmirror1, 'mirror')
dfsun1 = rename(dfsun1, 'sun')
dfyahoo1 = rename(dfyahoo1, 'yahoo')
dfmail1 = rename(dfmail1, 'mail')

all_horos = pd.merge(dfmirror1, dfsun1, right_on = 'Word', left_on = 'Word', how = 'outer')\
            .merge(dfyahoo1, right_on = 'Word', left_on = 'Word', how = 'outer')\
            .merge(dfmail1, right_on = 'Word', left_on = 'Word', how = 'outer')
all_horos.replace({np.nan:0}, inplace = True)

Unnamed: 0,Word,Freqmirror,Freqsun,Freqyahoo,Freqmail
0,you,25.0,32.0,52.0,32.0
1,to,23.0,28.0,40.0,45.0
2,a,19.0,36.0,29.0,29.0
3,your,14.0,26.0,38.0,24.0
4,will,13.0,3.0,2.0,9.0
5,and,11.0,19.0,15.0,17.0
6,be,11.0,5.0,12.0,11.0
7,the,8.0,28.0,33.0,28.0
8,it,8.0,5.0,6.0,19.0
9,or,8.0,5.0,6.0,2.0


In [98]:
all_horos['Total freq'] = all_horos['Freqmirror'] + all_horos['Freqsun'] + all_horos['Freqyahoo'] + all_horos['Freqmail']

#take the frame of word counts, tweak and twiddle into something to compare with Eng_5000_words
all_horos.sort_values(by = 'Total freq', ascending = False, inplace = True)
all_horos['RankAll'] = [x for x in range(len(all_horos))]
all_horos.reset_index(inplace = True)
all_horos.drop('index', axis = 1, inplace = True)

#merge horo word count with normal word count
compare = pd.merge(all_horos, eng_5000, right_on = 'Word', left_on = 'Word', how = 'inner')

#compare
df_more_common = compare[compare.RankAll < compare.Rank5000].copy()#needs .copy to show it's not a slice
df_more_common['More'] = compare.Rank5000 - compare.RankAll
df_more_common.sort_values('More', ascending = False, inplace = True)

df_per_cent_common = df_more_common[df_more_common['Total freq'] > 2].copy()
df_per_cent_common['Horos %'] = df_per_cent_common['Total freq'] / df_per_cent_common['Total freq'].sum()
df_per_cent_common['Eng 5000 %'] = df_per_cent_common['Frequency'] / df_per_cent_common['Frequency'].sum()
df_per_cent_common['More common'] = df_per_cent_common['Horos %'] - df_per_cent_common['Eng 5000 %']
df_per_cent_common.sort_values(by = 'More common',ascending = False, inplace = True)


top_sample = df_per_cent_common[df_per_cent_common['Horos %'] > df_per_cent_common['Eng 5000 %']]
top_sample[['Word', 'Total freq', 'Horos %', 'Frequency', 'Eng 5000 %', 'More common']]

Unnamed: 0,Word,Total freq,Horos %,Frequency,Eng 5000 %,More common
3,your,102.0,0.077685,659622.0,0.010823,0.066861
0,you,141.0,0.107388,3081151.0,0.050556,0.056832
24,love,15.0,0.011424,171176.0,0.002809,0.008616
39,yourself,12.0,0.009139,43401.0,0.000712,0.008427
30,someone,13.0,0.009901,95608.0,0.001569,0.008332
45,complete,11.0,0.008378,58512.0,0.000960,0.007418
13,will,27.0,0.020564,848919.0,0.013929,0.006634
68,being,7.0,0.005331,21695.0,0.000356,0.004975
57,relationship,8.0,0.006093,84549.0,0.001387,0.004706
71,sun,6.0,0.004570,32646.0,0.000536,0.004034


So, some words are as expected: you, your, yourself, as horos are written in second person. Someone also - horos can't give names.
But right up there is love! Also relationship, sun, single, heart, mind, life, wonderful, special ...
On the other hand, past and today are barely more common in horos than in normal language.
Obviously, with 5000 words, there will be very many words that are less common in horos than in normal language. So this is not a worthwhile investigation. Although ... the words with the biggest difference might be interesting.