In [1]:
# Load libraries
import numpy as np
import pandas as pd
from __future__ import division
import string
import math
from sklearn.feature_extraction.text import TfidfVectorizer
tokenize = lambda doc: doc.lower().split(" ")

In [2]:
# Load dataset
TwtDs = pd.read_csv('C:\\Users\\Meng\\OneDrive\\Documents\\tamu\\DataScience\\twitter-user-gender-classification\\gender-classifier-DFE-791531.csv',  na_values=['.'], encoding = "ISO-8859-1")

In [3]:
# shape
print(TwtDs.shape) 

(20050, 26)


In [4]:
# head
print(TwtDs.head(10))

    _unit_id _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  815719226   False   finalized                   3    10/26/15 23:24   
1  815719227   False   finalized                   3    10/26/15 23:30   
2  815719228   False   finalized                   3    10/26/15 23:33   
3  815719229   False   finalized                   3    10/26/15 23:10   
4  815719230   False   finalized                   3     10/27/15 1:15   
5  815719231   False   finalized                   3     10/27/15 1:47   
6  815719232   False   finalized                   3     10/27/15 1:57   
7  815719233   False   finalized                   3    10/26/15 23:48   
8  815719234   False   finalized                   3     10/27/15 1:52   
9  815719235   False   finalized                   3     10/27/15 1:49   

   gender  gender:confidence profile_yn  profile_yn:confidence  \
0    male             1.0000        yes                    1.0   
1    male             1.0000        yes              

In [5]:
# Dataset Info
TwtDs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
_unit_id                 20050 non-null int64
_golden                  20050 non-null bool
_unit_state              20050 non-null object
_trusted_judgments       20050 non-null int64
_last_judgment_at        20000 non-null object
gender                   19953 non-null object
gender:confidence        20024 non-null float64
profile_yn               20050 non-null object
profile_yn:confidence    20050 non-null float64
created                  20050 non-null object
description              16306 non-null object
fav_number               20050 non-null int64
gender_gold              50 non-null object
link_color               20050 non-null object
name                     20050 non-null object
profile_yn_gold          50 non-null object
profileimage             20050 non-null object
retweet_count            20050 non-null int64
sidebar_color            20050 non-null object
text    

In [6]:
# gender distribution
print(TwtDs.groupby('gender').size())

gender
brand      5942
female     6700
male       6194
unknown    1117
dtype: int64


In [7]:
# get a small portion of the dataset to do the experiment
TwtHd = TwtDs.head(100)

In [8]:
# the sampled dataset only have one hundred records
TwtHd.shape

(100, 26)

In [9]:
TwtHd.text.head(10)

0    Robbie E Responds To Critics After Win Against...
1    ÛÏIt felt like they were my friends and I was...
2    i absolutely adore when louis starts the songs...
3    Hi @JordanSpieth - Looking at the url - do you...
4    Watching Neighbours on Sky+ catching up with t...
5    Ive seen people on the train with lamps, chair...
6    @BpackEngineer Thank you for your patience whi...
7    Gala Bingo clubs bought for å£241m: The UK's l...
8    @_Aphmau_ the pic defines all mcd fangirls/fan...
9    @Evielady just how lovely is the tree this yea...
Name: text, dtype: object

In [10]:
# gender distribution in the sample
print(TwtHd.groupby('gender').size())

gender
brand      28
female     33
male       37
unknown     2
dtype: int64


In [11]:
# Split the dataset by gender
TwtHd_m = TwtHd[TwtHd.gender=='male']
TwtHd_f = TwtHd[TwtHd.gender=='female']
TwtHd_b = TwtHd[TwtHd.gender=='brand']

In [12]:
# Tokenize and normalize the text columns of tweets
sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(TwtHd.text)

In [13]:
# Define the functions to carry out the same tokenize functions as sklearn.feature_extraction.text
def sublinear_term_frequency(term, tokenized_document):
    if tokenized_document.count(term) != 0 :
        return (1 + math.log(tokenized_document.count(term)))
    else:
        return 0

def augmented_term_frequency(term, tokenized_document):
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))

def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values

def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents

In [14]:
tfidf_representation = tfidf(TwtHd.text)

In [30]:
print ("TF-IDF Manual Method\n")
print (tfidf_representation[1])
len(tfidf_representation[1])

TF-IDF Manual Method

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7147984280919264, 0.0, 4.218875824868201, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.8134107167600364, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.605170185988092, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

901

In [31]:
print ("TF-IDF Sklearn Method\n")
print (sklearn_representation.toarray()[1].tolist())
print (sklearn_representation.toarray()[1].shape)
print (TwtHd.text[1])

TF-IDF Sklearn Method

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2752082698280501, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2752082698280501, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1332938971585182, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0