# Sandbox for PAN experiments

### To do
* ~~Pan 14 and 16 (?) have HTML tags in the text tweets~~ BeautifulSoup is slow
* Pan 14 and 16 have duplicates (between and among?)
* Langauge data



In [5]:
import os

data_dir = "/data/pan17"

# 2014 training twitter
en14 = "pan14-author-profiling-training-dataset-english-twitter-2014-04-16"
es14 = "pan14-author-profiling-training-dataset-spanish-twitter-2014-04-16"
pan14 = [os.path.join(data_dir, d) for d in [en14, es14]]

# 2015 training (all twitter)
# below also have 2015-03-02 versions
du15 = "pan15-author-profiling-training-dataset-dutch-2015-04-23"
en15 = "pan15-author-profiling-training-dataset-english-2015-04-23"
it15 = "pan15-author-profiling-training-dataset-italian-2015-04-23"
es15 = "pan15-author-profiling-training-dataset-spanish-2015-04-23"
pan15 = [os.path.join(data_dir, d) for d in [du15, en15, it15, es15]]

# 2016 training (all twitter)
# below also have 2016-02-29 and 2-16-03-26 versions
du16 = "pan16-author-profiling-training-dataset-dutch-2016-04-25"
en16 = "pan16-author-profiling-training-dataset-english-2016-04-25"
es16 = "pan16-author-profiling-training-dataset-spanish-2016-04-25"
pan16 = [os.path.join(data_dir, d) for d in [du16, en16, es16]]

# 2017 training (all twitter)
pan17_root = "pan17-author-profiling-training-dataset-2017-03-10"
ar17 = os.path.join(pan17_root, "ar")
en17 = os.path.join(pan17_root, "en")
es17 = os.path.join(pan17_root, "es")
pt17 = os.path.join(pan17_root, "pt")
pan17 = [os.path.join(data_dir, d) for d in [ar17, en17, es17, pt17]]

In [1]:
import xmltodict
import logging
import glob
import os
import pandas as pd
import sys
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore", category=UserWarning)



# Main dataset loading utility
class PanDataLoader:
    
    def __init__(self, logger=None):
        if logger is None:
            logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
            self.log = logging.getLogger(__name__)
        else:
            self.log = logger
                               
    def load_17(self, directory):
            
        """Load and return the pan17 gender and variation twitter dataset.
        ==============                                      ==============
        Samples total                                                10800
        Targets            nominal [{male, female},
                                    {ar, pt, es, en},
                                    {'brazil', 'australia', 'venezuela',
                                     'portugal', 'great britain', 'chile',
                                     'levantine', 'egypt', 'colombia',
                                     'peru', 'ireland', 'argentina',
                                     'maghrebi', 'mexico', 'new zealand',
                                     'spain', 'canada', 'gulf'}]
        ==============                                      ==============
        Parameters
        ----------
        inputdir
        The directory containing the training data, i.e. /data/training.

        Returns
        -------
        data : Pandas dataframe
            The interesting attributes are:
            'text', the data to learn, ['gender','lang', variety],
            the regression targets,
        Examples
        --------
        >>> from datasets import load_pan17
        >>> df_training = load_pan17(inputdir)
        >>> print(df_training.corpus.shape)
        (10800, 5)
        """

        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        # check that the dataset is loaded correctly

        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                doc = xmltodict.parse(f.read())
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = doc['author']['documents']['document']
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'variety'],
                             engine='python')

        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    def load_16(self, directory):
        return self.load_14(directory)
    
    def load_15(self, directory):
        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                doc = xmltodict.parse(f.read())
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = doc['author']['document']
            # print(author, lang, text[:100])
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'age', '1','2','3','4', '5'],
                             engine='python') 


        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    
    def load_14(self, directory):
        errors = 0
        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                try:
                    doc = xmltodict.parse(f.read())
                except Exception as e:
                    self.log.warning(e)
                    self.log.warning("Skipping: {}".format(t))
                    continue
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = []
            for td in doc['author']['documents']['document']:
                try:
                    t = BeautifulSoup(td['#text'], "lxml").getText()
                    text.append(t)
                except Exception as e:
                    errors += 1
                    # log.warning(e)
                    # self.log.warning("skipping {}".format(td))
                    continue
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'age'],
                             engine='python') 

        self.log.warning("Skipped {}".format(errors))

        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    def _load_all(self, loader_func, directories):
        """Concatenate across languages"""
        corpora = []
        for dr in directories:
            corpus = loader_func(dr)
            corpora.append(corpus)
        return pd.concat(corpora)
    
    def load_all_17(self, directories):
        return self._load_all(self.load_17, directories)
    
    def load_all_16(self, directories):
        return self._load_all(self.load_16, directories)
    
    def load_all_15(self, directories):
        return self._load_all(self.load_15, directories)
    
    def load_all_14(self, directories):
        return self._load_all(self.load_14, directories)
    
    def clean_and_normalize(self, corpus):
        """Standardize to lowercase for gender and langauge, m/f for gender
           Remove personality scores"""
        # FIXME TODO -- how do you do this in place?
        # FIXME TODO -- normalize age ranges?
        corpus['gender'] = corpus['gender'].apply(lambda s: s[0].lower())
        corpus['lang'] = corpus['lang'].apply(lambda s: s.lower())

        for c in ['1', '2', '3', '4', '5']:
            if c in corpus:
                del corpus[c]
        return corpus





In [2]:
pdl = PanDataLoader()

In [6]:
# corpus14 = pdl.load_all_14(pan14)
# corpus15 = pdl.load_all_15(pan15)
# corpus16 = pdl.load_all_16(pan16)
corpus17 = pdl.load_all_17("/data/pan17/pan17-author-profiling-training-dataset-2017-03-10/en/")

FileNotFoundError: [Errno 2] No such file or directory: '/truth.txt'

In [9]:
corpus17

Unnamed: 0,author,lang,text,gender,variety
0,100a149ae0db9d907276e72ad5582a8e,ar,[#فخامه_العماد_المقاوم اللي ما بيتغيّر مش متل ...,female,levantine
1,100dd9f15a7098676f89b8889295855d,ar,"[@faris_dody ربنا يخليه وتفرحي بيه 😘😘, كفاية ش...",female,egypt
2,10120669678faad622ccbc00a25abaf2,ar,[لسه ما لحقت افتح الفيسبوك صارت لاقطيتني يا ال...,female,levantine
3,101d8b7e5c45138f79a381d4ae7802f,ar,"[@Taher_NourEldin شاقط واحدة وبيتغدا معاها, أب...",male,egypt
4,102b9ed9481d0b9e748afbf8ce473890,ar,[@Shereyn3 ده الانجليز اللي احتلونا هما كمان ب...,female,egypt
5,103480fec7cb456be6a7c940b7ebff93,ar,[...\n...\nنحبُّ الشّتاءَ، إذا انقطع الدَّربُ،...,female,gulf
6,10461ae1014ed79def086308df47da26,ar,"[على سعيد الكعبى بيذيع ........متعة, @Egyleg 2...",male,egypt
7,10598977b6bedaa972f12c93cdba158c,ar,[مساء الخير يا أوسخ جمهور في مصر إلا من رحم رب...,male,egypt
8,106086df9c362aa4dcf4c824ada0dc05,ar,[في حدا بس يسويلي لايك برجع بقرأ شو كاتبة \n😂🙈...,female,levantine
9,10a0f001bdf9283986a44946fa66f8a0,ar,"[@allthings_D yas! المجموعة المصرية للعيون, @M...",male,egypt


In [13]:
# find duplicates
authors = [set(corpus['author']) for corpus in [corpus14, corpus15, corpus16, corpus17]]
duplicates = [[0 for _ in range(len(authors))] for __ in range(len(authors))]
for j in range(len(authors)):
    for k in range(len(authors)):
        duplicates[j][k] = str(len(authors[j].intersection(authors[k]))).zfill(3)

for line in duplicates:
    print(line)

['425', '000', '397', '000']
['000', '324', '000', '000']
['397', '000', '594', '000']
['000', '000', '000', '11400']


In [14]:
# there are duplicates between 14 and 16

duplicates = set(corpus14['author']).intersection(set(corpus16['author']))
num_duplicates = len(duplicates)
print(num_duplicates)
print(len(corpus14['author']))
print(len(corpus16['author']))


397
483
678


In [15]:
for c in [corpus14, corpus15, corpus16, corpus17]:
    n_authors = len(c['author'])
    u_authors = len(set(c['author']))
    print("{} {}: {}".format(n_authors, u_authors, n_authors - u_authors))

483 425: 58
324 324: 0
678 594: 84
11400 11400: 0


In [20]:
# removes duplicates but doesn't regard language
corpus16[~corpus16['author'].isin(corpus14['author'])]

Unnamed: 0,author,lang,text,gender,age
3,6e602536dfb72bb2c881fd25e6a92f46,EN,"[Attending <a href=""/hashtag/gbl10?src=hash"" d...",MALE,25-34
5,b9114397ca060ce9de73d8073b0c53c3,EN,[Infographic: Doctors Prescribing More Mobile ...,FEMALE,35-49
6,43d7f732e9da60bc1d8f13294c3a3f1c,EN,"[<a href=""/s99drine"" class=""twitter-atreply pr...",FEMALE,35-49
8,e827776a1e77b7bee940b485c5a4b5b7,EN,"[<a href=""/hashtag/tweetameet?src=hash"" data-q...",MALE,25-34
14,b88171637fa04a302e94b14402f2793a,EN,"[<a href=""/scottsanchez"" class=""twitter-atrepl...",MALE,25-34
16,ca75d60a9ff2c98b575535212f338b6a,EN,"[<a href=""/alexismadrigal"" class=""twitter-atre...",MALE,35-49
18,638475f75a7f842d4ce14e690012a5cf,EN,[OUTSIDE THE STUDIO: Art in the Park w BVAG - ...,FEMALE,50-64
24,a882cbfa5477e22a082c5b0c0c288dfb,EN,"[Sebastian Ingrosso, Tommy Trash, John Martin ...",FEMALE,18-24
25,f5abf96f244c876d17ecf69863cb0abb,EN,"[@socialmurcia1 <a href=""/smerigom"" class=""twi...",FEMALE,35-49
29,29498c0afa48a739013e2a3f8c1e2937,EN,"[Gracias por seguirme: <a href=""/AppleMacWatch...",MALE,35-49


In [9]:
# some authors appear in the same dataset for two different languages
# the tweets are different
corpus16[corpus16['author'].isin(['60d21d384ea52074f685904742b8cd4e'])]


Unnamed: 0,author,lang,text,gender,age
90,60d21d384ea52074f685904742b8cd4e,EN,"[Spectrum con tarjeta ethernet <a href=""/hasht...",MALE,35-49
4,60d21d384ea52074f685904742b8cd4e,ES,"[De <a href=""/juleniturbe"" class=""twitter-atre...",MALE,35-49


In [242]:
# only pan14 and pan16 have duplicates (within and between), so always keep them as part of one split
# i.e. 14 and 16 should be in test or train, but not split between the two

In [40]:
corpus14, corpus15, corpus16, corpus17 = (pdl.clean_and_normalize(c) for c in [corpus14, corpus15, corpus16, corpus17])

In [17]:
corpus14[corpus14['lang'] == 'en']

Unnamed: 0,author,lang,text,gender,age
0,4877dddcc26b8768206f3adb2371193c,en,[The Internet Archive is now home to 10 petaby...,m,25-34
1,255291fff1e306d3332da3464011ee91,en,"[Love, Loyalty and adventure.. Lost in the Mis...",f,50-64
2,9faf9af6f25373fdc722be0e07938a00,en,[Healthy living is a mindset - get in the habi...,f,35-49
3,960c08e2ef0351c0bb50e4a109246880,en,[Happy Moodle 2.1 Release Day! | Moodle News h...,f,25-34
4,b94c091c649fa9b7703a9f4a0ffb3304,en,[#Colombia: El Espectador critica en su editor...,m,35-49
5,de0f088339eda6d3751034d06ea5845f,en,[Hey! Y el EQUIPAZO DE MENTORES tb! #iw_castel...,f,35-49
6,4a9c6df0b335c37ef60611c592c3d154,en,[@Hostgator broke my site in the process of tr...,m,25-34
7,d1ec90379b8333663fec536e79fe29f2,en,[Desengrasante http://instagram.com/p/geATIlRp...,m,35-49
8,1b50196b8436f77669bdee925a2c4dfa,en,[En dan verneem je plots dat je vermeld wordt ...,f,25-34
9,36b2593435e1bed13eb138c1973c13ed,en,[Tax stuff is to the accountant!! Makes you s...,m,50-64


In [18]:
corpus14 = corpus14[corpus14['lang'] == 'en']
corpus15 = corpus15[corpus15['lang'] == 'en']
corpus16 = corpus16[corpus16['lang'] == 'en']
corpus17 = corpus17[corpus17['lang'] == 'en']


In [41]:
corpus14['text'] = corpus14['text'].apply(lambda x: "\n".join(x))
corpus15['text'] = corpus15['text'].apply(lambda x: "\n".join(x))
corpus16['text'] = corpus16['text'].apply(lambda x: "\n".join(x))
corpus17['text'] = corpus17['text'].apply(lambda x: "\n".join(x))

In [56]:
import pickle

with open("corpus15.pickle", "wb") as f:
    pickle.dump(corpus15, f)

In [59]:
with open("corpus15.pickle", "rb") as f:
    test15 = pickle.load(f)

In [64]:
set(test15['lang'])

{'en', 'es', 'it', 'nl'}

In [45]:
# corpus14['text'] = corpus14['text'].apply(lambda x: BeautifulSoup(x, "lxml").getText())
corpus16['text'] = corpus16['text'].apply(lambda x: BeautifulSoup(x, "lxml").getText())


In [33]:
for text in corpus14[corpus14['gender'] == 'm']['text']:
    print(text)
    break

The Internet Archive is now home to 10 petabytes of data http://zite.to/SpDcpU 
Big Data Right Now: Five Trendy Open Source Technologies http://zite.to/PzvxqJ 
Apple product rumours for 2013 http://zite.to/TnWtsJ 
How to Install WordPress on RackSpace Cloud Server Without Coding http://zite.to/U6Clrw 
My learnings from 500 Startups – the first month http://zite.to/SPKL7N 
First vertical farm opens in Singapore (Wired UK) http://zite.to/TMEJcw 
Amazon’s Aggressive Homepage http://zite.to/WU56Oq 
Google Launches Crisis Map for Hurricane #Sandy http://zite.to/PBR1TQ 
Amazon suit shows Google as public #cloud threat http://zite.to/Rgi0PT 
Tony Robbins: Change Your Words, Change Your Life http://lnkd.in/Nm8y_P 
How Automated Workflows 'Work': A Simple Setup Guide http://zite.to/QQpcnE 
Search Advertising Equates to $100 Million Daily for Google http://zite.to/VZ7hky 
Rackspace versus Amazon: The big data edition http://zite.to/QQqJdy  #cloud
Google launches open source Voter Information Too

In [35]:
with open("all_english_female.txt", "w") as f:
    for c in [corpus14, corpus15, corpus16, corpus17]:
        for text in c[c['gender'] == 'f']['text']:
            f.write("{}\n".format(text))
        
        

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.svm import LinearSVC

training = pd.concat([corpus15, corpus15, corpus16])
training = training[training['lang'] == 'en']
test = corpus17
test = test[test['lang'] == 'en']

# char_vec = TfidfVectorizer(analyzer='char', ngram_range=(2,6))
word_vec = TfidfVectorizer(ngram_range=(1,3), binary=True)

vectorizer = FeatureUnion([
    ('word', word_vec)
    # ('char', char_vec)
])




In [50]:
from collections import Counter
types = Counter([type(x) for x in test['text']])
print("test types: {}".format(types))
types = Counter([type(x) for x in training['text']])
print("training types: {}".format(types))
types = Counter([type(x) for x in training['text'] + test['text']])
print("merged types: {}".format(types))

types = Counter([type(x) for x in pd.concat([training['text'],test['text']])])
print("merged types 2 : {}".format(types))


test types: Counter({<class 'str'>: 3600})
training types: Counter({<class 'str'>: 732})
merged types: Counter({<class 'float'>: 3172, <class 'str'>: 732})
merged types 2 : Counter({<class 'str'>: 4332})


In [51]:
vectorizer.fit(pd.concat([training['text'], test['text']]))

FeatureUnion(n_jobs=1,
       transformer_list=[('word', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))],
       transformer_weights=None)

In [None]:
X_train = vectorizer.transform(training['text'])
y_train = training['gender']
X_test = vectorizer.transform(test['text'])
y_test = test['gender']

In [None]:
X_train.shape
print(X_test.shape)

In [None]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)
preds = svm.predict(X_test)



In [228]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB(alpha=0.001)
nb.fit(X_train, y_train)
preds2 = nb.predict(X_test)

In [48]:
print(classification_report(y_test, preds))

NameError: name 'classification_report' is not defined

In [None]:
Counter(y_test)

In [None]:
Counter(y_train)

In [None]:
Counter(preds)

In [148]:
probs = svm.predict_proba(X_test)

In [211]:
female_looks_male = [(x[1], i) for i,x in enumerate(probs) if y_test[i]=="f" and x[1] > 0.50]
print(sorted(male_looks_female, reverse=True)[:10])

[(0.89800115850759787, 524), (0.86119659576805552, 918), (0.85948208740317422, 2617), (0.85948208740317378, 774), (0.85319799964626375, 2948), (0.84333002415841807, 2019), (0.83300630655402697, 2256), (0.82832491272958053, 2343), (0.81426756875048423, 498), (0.79851783521847131, 2765)]


In [47]:
print(test.iloc[524])
print(test['text'].iloc[524])

NameError: name 'test' is not defined

In [214]:
print(test['text'].iloc[918])

Embrace your whole self! "The more I like me, the less I want to pretend to be other people." Jamie Lee Curtis #NimbleQuotes
Want to truly master something? "...the way of the Master consists in doing one's best...that is all." Confucius #NimbleQuotes
There is no growth without change. "[Be] strong enough and smart enough to let go and grow." Marc and Angel Chernoff #NimbleQuotes
There is no substitute for wisdom. "There's a beauty to wisdom and experience that cannot be faked." Amy Grant #NimbleQuotes
Will you express yourself? "Now it is your turn." Laura Probert #NimbleQuotes
Keep an open mind. "The soul should always stand ajar; ready to welcome the ecstatic experience." Emily Dickinson #NimbleQuotes
Indulge in learning and thought! "The noblest pleasure is the joy of understanding." Leonardo da Vinci #NimbleQuotes
Progress never stops, life carries on. "Nature does not hurry, yet everything is accomplished." Lao Tzu #NimbleQuotes
Learn more to give more and forever. "Education is 

In [210]:
male_looks_female = [(x[0], i) for i,x in enumerate(probs) if y_test[i]=="m" and x[0] > 0.50]
print(sorted(female_looks_male, reverse=True)[:10])

[(0.85991748437095072, 423), (0.85840328485276374, 2733), (0.81928753196870796, 1809), (0.81142208128169091, 3341), (0.80149033402731817, 1623), (0.77827680328579829, 145), (0.77216773992424448, 790), (0.76754563617773763, 713), (0.7640991539833788, 257), (0.76340026888989143, 2628)]


In [205]:
print(test['text'].iloc[524])

Saturday morning shenanigans with my guy 🐾🐶😍Lots of pups out today playing in the snow 👏🏻👏🏻 Bro… https://t.co/9oCuBCWIZA
On Fridays we dine in Gastown❤️! Papardelle Bolognese for the win 🙌🏻🇲🇽! Friyays are for lunch… https://t.co/7d95eS9G0V
Gastown vibe this #friyay ❄️☃️#vancity #vancitybuzz #gastown #yvr #letitsnow #february #canada @… https://t.co/vyrzPKn7YP
8am #downtown Bro Walk Vibe ☃️❄️. Bundle up. Another #articfront💨 #friday #pug #winter #snow… https://t.co/zgokuabaeT
Albondigas Soup and homemade tortillas...its #whatsfordinner 👌🏻Bringing some real Mexican 🇲🇽 to… https://t.co/0ayu95Cry1
Up on the blog today, we share our favorite spots in Gastown, where you can #live 🌇 #work 🚴🏻💼&amp;… https://t.co/jDyzCLveRK
O P E N  H O U S E 
Join us mkrealestategroup this weekend for an open house Saturday &amp; Sunday… https://t.co/3luvonC1Mw
O P E N H O U S E
Join us mkrealestategroup this weekend, Saturday and Sunday from 2-4pm to view… https://t.co/0HUgOobghk
Love the chic elegance of the

In [None]:
b

In [2]:
import pickle
with open("../../data/pan/corpus17.pickle", "rb") as f:
    corpus17 = pickle.load(f)

In [3]:
en17 = corpus17[corpus17['lang'] == 'en']

In [78]:
fen17 = en17[en17['gender'] == 'f']
men17 = en17[en17['gender'] == 'm']

In [10]:
from spacy.en import English

nlp = English()

In [12]:
corpus17 = corpus17[corpus17['lang'] == 'en']

In [16]:
corpus17['text'] = corpus17['text'].apply(lambda x: "\n".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [19]:
corpus17

Unnamed: 0,author,lang,text,gender,variety
0,1003de26f870d27f79887272a1eb3612,en,One to watch … \nAvailable on 10th Feb. https:...,male,new zealand
1,102cce280df9f6e0e78bfdd266f1abb5,en,Are we living in a holographic universe? New s...,female,canada
2,10488b3700fa9d2db22961ab064e4d38,en,"Museum focus, but still great pieces of advice...",female,new zealand
3,1064bd0b78f14bea5b851e2a995dd4e5,en,Best half time show EVER!\n@jannarden not the...,female,canada
4,106aa0abb81873d09028b01658c37611,en,Does this mean @WaitakereUnited are top of the...,male,new zealand
5,1074ea46e0f2d49b18b4d77b9aa8c9b,en,in the time of chimpanzees I was a monkey\nHow...,male,canada
6,108d0b0b44c53042b7ee4fe3576a8cc9,en,Poetry Shelf The Summer Season: Poets pick poe...,female,new zealand
7,10927a4b7defb6c60d5c4ac254050c2,en,@melindiscott 😂Don't have anything to do with ...,female,great britain
8,10adebacc07508e4e5e921d1ea6fa3b5,en,@IxDAwards I vote for #nzvisagateway for the 2...,male,new zealand
9,10b29c3fa3a80479a726ace913768934,en,@amlozyk But guess if those expectations are v...,male,canada


In [64]:
texts = [x for x in corpus17['text']]

In [27]:
spd = []
for i, text in enumerate(texts):
    if i % 100 == 0:
        print("{} ".format(i), end="")
    spd.append(nlp(text))
    

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 

In [55]:
def get_tags(sp):
    return ' '.join([x.tag_ for x in sp])

ttexts = [get_tags(x) for x in spd]
ys = [0 if g == "female" else 1 for g in corpus17['gender']]

In [56]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [110]:
word_vec = TfidfVectorizer(binary=True, ngram_range=(1,3))
char_vec = TfidfVectorizer(ngram_range=(3,6), analyzer='char')
classifier = LinearSVC()

In [101]:
def tag_text(spacy_text):
    return "{} \n {}".format(spacy_text, ' '.join([word.tag_ for word in spacy_text]))

In [102]:
tagged_texts = [tag_text(text) for text in spd]

In [103]:
tagged_texts[0]



In [111]:
from sklearn.pipeline import FeatureUnion

fu = FeatureUnion([
    ('word', word_vec),
    ('char', char_vec)
])

In [112]:
Xs = fu.fit_transform(tagged_texts)

In [113]:
Xs.shape

(3600, 16585575)

In [114]:
cross_val_score(classifier, Xs, ys, cv=5, n_jobs=-1)
# text + tag (concatenated) array([ 0.77638889,  0.76805556,  0.76944444,  0.75416667,  0.76666667])

array([ 0.775     ,  0.8       ,  0.78888889,  0.76944444,  0.81944444])