In [1]:
import pandas as pd
import numpy as np

In [44]:
def transform(data):
    # transform the current dataframes (as in gn_gender_all.csv) into 
    # format suitable for 
    words = list(set(data['word']))
    groups = list(set(data['group']))
    counts = list()
    for gr in groups:
        count = list()
        data_group = data.loc[data['group']==gr,]
        for word in words:
            data_word = data_group.loc[data_group['word']==word,]
            if list(data_word.shape)[0]>0:
                count.append(list(data_word['counts'])[0])
            else:
                count.append(0)
        counts.append(count)
    return words,counts

In [16]:
def sumColumn(m, column,exp=1):
    # calculate column sum of a matrix; exp is the exponent 
    # so that the result is (sum(column_data^exp)) and set to 1 by default
    total = 0
    for row in range(len(m)):
        total += m[row][column]**exp
    return total

def sumRow(m, row,exp=1):
    # calculate row sum of a matrix; exp is the exponent 
    # so that the result is (sum(row_data^exp)) and set to 1 by default
    total = 0
    for col in range(len(m[0])):
        total += m[row][col]**exp
    return total


In [33]:
from sklearn.feature_extraction.text import TfidfTransformer


def tfidf(counts,method='scikit-learn'):
    if method=='scikit-learn':
        # The scikit-learn method
        transformer = TfidfTransformer()
        result = list(transformer.fit_transform(counts).toarray())
        for row in range(len(counts)):
            result[row] = list(result[row])
    else:
        result = []
        for row in range(len(counts)):
            result.append([])
        for col in range(len(counts[0])):
            colSum = sumColumn(counts,col)
            for row in range(len(counts)):
                if method=='conditional':
                    # conditional probability of the entry being in a certain group 
                    # given that a certain word is observed
                    result[row].append(float(counts[row][col])/colSum)
                elif method=='log-conditional':
                    # similar to conditional, just instead of dividing the total 
                    # frequency of each word, divide the log of it so that the 
                    # result is less extreme 
                    result[row].append(float(counts[row][col])/np.log(colSum))
                else:
                    print "method not found"
                    return result
        # Then do l2 normalization
        for row in range(len(result)):
            norm = sumRow(result,row,2)
            for col in range(len(result[0])):
                result[row][col] = result[row][col]**2/norm
                    
    return result

In [75]:
def wordCloudString2(frequency,words,scale=1000):
    # Take the frequency calculated from tf-idf to generate a string for 
    # word cloud making.
    wordCloudStr = "" 
    for i in range(len(words)):
        word = words[i]
        # scale is a constant to be multiplied to the frequency to get the actual
        # number of times a word is repeated in the string, so that this number
        # is >1 for all words and proportional to their frequencies.
        for j in range(int(frequency[i]*scale)):
            wordCloudStr += word
            wordCloudStr += " "
    return wordCloudStr

In [88]:
Data = pd.read_csv("../gn_age_group_all.csv")
Data['group']

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
270    5
271    5
272    5
273    5
274    5
275    5
276    5
277    5
278    5
279    5
280    5
281    5
282    5
283    5
284    5
285    5
286    5
287    5
288    5
289    5
290    5
291    5
292    5
293    5
294    5
295    5
296    5
297    5
298    5
299    5
Name: group, dtype: int64

In [89]:
words,counts = transform(Data)

In [90]:
x1 = tfidf(counts,method='scikit')
x2 = tfidf(counts,method='conditional')
x3 = tfidf(counts,method='log-conditional')

In [101]:
wordCloudString2(x1[1],words,100)

'houston houston houston houston houston houston houston houston houston houston random random random random facts facts facts facts facts facts facts hate hate hate hate hate hate hate hate hate hate hate member member member member member member member simple simple simple simple simple simple simple good good good good good know know know know know know know know know know know lasik lasik lasik lasik lasik lasik lasik lasik lasik like like like like like like like like like like like like like like like like like like like like like banking banking banking banking banking banking banking banking someone someone someone someone someone someone someone someone someone people people people people people people people people people seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo seo home home home home home home home home girl girl girl girl girl girl depressi