In [2]:
# TEDtalks: Topics with LDA

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re


# Create pandas dataframe
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)

# Create lists for the data
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()

# Get years & combine with author
years = [re.sub('[A-Za-z ]', '', item) for item in dates]
authordate = [author+" "+year for author, year in zip(authors, years)]

# Just to check to see if things are synced,
# let's create a new df with the two lists.

citations = pandas.DataFrame(
    {'citation': authordate,
     'text': talks,
    })

In [None]:
# First, replace the first item with the correct label for the new column
authordate[0] = "citation"

In [12]:
# Second, convert the list to a series
citations = pandas.Series(authordate) 

In [14]:
# Third, add to dataframe above
df['citation'] = citations.values

# df.head(10)

In [19]:
# Set as index?
indexed = df.set_index('citation')
indexed.head()

Unnamed: 0_level_0,author,title,date,length,text
citation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
citation,Author,Title,Date,Length,Text
Al Gore 2006,Al Gore,Averting the climate crisis,Jun 2006,957,Thank you so much Chris. And it's truly a gre...
David Pogue 2006,David Pogue,Simplicity sells,Jun 2006,1271,Hello voice mail my old friend. I've called f...
Cameron Sinclair 2006,Cameron Sinclair,My wish: A call for open-source architecture,Jul 2006,1398,I'm going to take you on a journey very quickl...
Sergey Brin + Larry Page 2007,Sergey Brin + Larry Page,The genesis of Google,May 2007,1205,Sergey Brin I want to discuss a question I kn...


The `speakers-gender.csv` has some non-standard characters in it. I need to figure out how to read it without stumbling on those speakers. 

It turns out that by setting the encoding to `latin-1`, the file was read into a dataframe with no problem. 

In [28]:
# Load the genders CSV

# colnames = ['Name','Occupation','ShortDescription','LongDescription','Gender',
# 'MaleScore','FemaleScore','NonBinaryScore','Gender + hand codes']

spkr_gndr = pandas.read_csv('../data/speakers-gender.csv', encoding='latin-1')

In [51]:
spkr_gndr.head()

Unnamed: 0,Name,Occupation,ShortDescription,LongDescription,Gender,MaleScore,FemaleScore,NonBinaryScore,Gender + hand codes,Unnamed: 9
0,Abha Dawesar,Novelist,Abha Dawesar writes to make sense of the world...,Why you should listen\rAbha Dawesar began her ...,female,0,10,0,female,
1,Abigail Washburn,Clawhammer banjo player,Abigail Washburn pairs venerable folk elements...,Why you should listen\rIf American old-time mu...,female,0,5,0,female,
2,Achenyo Idachaba,Green entrepreneur,"Achenyo Idachaba is the head of MitiMeth, a Ni...","Why you should listen\rIn 2009, Achenyo Idacha...",female,0,4,0,female,
3,Aditi Gupta,"Social entrepreneur, co-founder of Menstrupedia",Aditi Gupta uses storytelling and art to educa...,Why you should listen\rAditi Gupta is a social...,female,0,6,0,female,
4,Aditi Shankardass,Neuroscientist,Aditi Shankardass is pioneering the use of EEG...,Why you should listen\rAditi Shankardass is a ...,female,0,3,1,female,


In [46]:
# Simplify the dataframe by dropping columns (especially that long description)

# sg2 = spkr_gndr.drop('LongDescription', axis=1)
# sg3 = sg2.drop('MaleScore', axis = 1)
# sg4 = sg3.drop('FemaleScore', axis = 1)
# sg5 = sg4.drop('NonBinaryScore', axis = 1)
# sg6 = sg5.drop('Gender', axis = 1)
# sg7 = sg6.drop('Unnamed: 9', axis = 1)

genders = sg7
# sg7.columns['name', 'occupation', 'description', 'gender'] # Didn't work.
# So I'm going to save the file to CSV, edit the headers, and then read it back in. 

In [47]:
genders.head()

Unnamed: 0,Name,Occupation,ShortDescription,Gender + hand codes
0,Abha Dawesar,Novelist,Abha Dawesar writes to make sense of the world...,female
1,Abigail Washburn,Clawhammer banjo player,Abigail Washburn pairs venerable folk elements...,female
2,Achenyo Idachaba,Green entrepreneur,"Achenyo Idachaba is the head of MitiMeth, a Ni...",female
3,Aditi Gupta,"Social entrepreneur, co-founder of Menstrupedia",Aditi Gupta uses storytelling and art to educa...,female
4,Aditi Shankardass,Neuroscientist,Aditi Shankardass is pioneering the use of EEG...,female


In [48]:
genders.to_csv('../data/genders.csv')

In [49]:
gender_df = pandas.read_csv('../data/genders.csv')

In [50]:
gender_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Occupation,ShortDescription,Gender
0,0,Abha Dawesar,Novelist,Abha Dawesar writes to make sense of the world...,female
1,1,Abigail Washburn,Clawhammer banjo player,Abigail Washburn pairs venerable folk elements...,female
2,2,Achenyo Idachaba,Green entrepreneur,"Achenyo Idachaba is the head of MitiMeth, a Ni...",female
3,3,Aditi Gupta,"Social entrepreneur, co-founder of Menstrupedia",Aditi Gupta uses storytelling and art to educa...,female
4,4,Aditi Shankardass,Neuroscientist,Aditi Shankardass is pioneering the use of EEG...,female


In [52]:
gender_df2 = gender_df.drop('Unnamed: 0', axis = 1)

In [53]:
gender_df2.head()

Unnamed: 0,Name,Occupation,ShortDescription,Gender
0,Abha Dawesar,Novelist,Abha Dawesar writes to make sense of the world...,female
1,Abigail Washburn,Clawhammer banjo player,Abigail Washburn pairs venerable folk elements...,female
2,Achenyo Idachaba,Green entrepreneur,"Achenyo Idachaba is the head of MitiMeth, a Ni...",female
3,Aditi Gupta,"Social entrepreneur, co-founder of Menstrupedia",Aditi Gupta uses storytelling and art to educa...,female
4,Aditi Shankardass,Neuroscientist,Aditi Shankardass is pioneering the use of EEG...,female


In [54]:
genders = gender_df2.set_index('Name')

In [55]:
genders.head()

Unnamed: 0_level_0,Occupation,ShortDescription,Gender
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abha Dawesar,Novelist,Abha Dawesar writes to make sense of the world...,female
Abigail Washburn,Clawhammer banjo player,Abigail Washburn pairs venerable folk elements...,female
Achenyo Idachaba,Green entrepreneur,"Achenyo Idachaba is the head of MitiMeth, a Ni...",female
Aditi Gupta,"Social entrepreneur, co-founder of Menstrupedia",Aditi Gupta uses storytelling and art to educa...,female
Aditi Shankardass,Neuroscientist,Aditi Shankardass is pioneering the use of EEG...,female


In [56]:
genders.to_csv('../data/genders.csv')