In [1]:
# TEDtalks: Topics with LDA

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re


# Create pandas dataframe
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)

# Create lists for the data
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()

# Get years & combine with author
years = [re.sub('[A-Za-z ]', '', item) for item in dates]
authordate = [author+" "+year for author, year in zip(authors, years)]

# Just to check to see if things are synced,
# let's create a new df with the two lists.

citations = pandas.DataFrame(
    {'citation': authordate,
     'text': talks,
    })

In [2]:
# First, replace the first item with the correct label for the new column
authordate[0] = "citation"

# Second, convert the list to a series
citations = pandas.Series(authordate) 

# Third, add to dataframe above
df['citation'] = citations.values

# df.head(10)

# Set as index?
indexed = df.set_index('citation')
indexed.head()

# Save the indexed dataframe
indexed.to_csv('../data/talks_4.csv')

In [2]:
# Load the revised dataframe
import pandas
main_df = pandas.read_csv('../data/talks_4.csv').set_index('citation')

main_df.head()

Unnamed: 0_level_0,author,title,date,length,text
citation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Al Gore 2006,Al Gore,Averting the climate crisis,Jun 2006,957,Thank you so much Chris. And it's truly a gre...
David Pogue 2006,David Pogue,Simplicity sells,Jun 2006,1271,Hello voice mail my old friend. I've called f...
Cameron Sinclair 2006,Cameron Sinclair,My wish: A call for open-source architecture,Jul 2006,1398,I'm going to take you on a journey very quickl...
Sergey Brin + Larry Page 2007,Sergey Brin + Larry Page,The genesis of Google,May 2007,1205,Sergey Brin I want to discuss a question I kn...
Nathalie Miebach 2011,Nathalie Miebach,Art made of storms,Oct 2011,247,What you just heard are the interactions of ba...


The `speakers-gender.csv` has some non-standard characters in it. I need to figure out how to read it without stumbling on those speakers. 

It turns out that by setting the encoding to `latin-1`, the file was read into a dataframe with no problem. 

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Create a clean version of genders dataframe
# =-=-=-=-=-=-=-=-=-=-= 

# Load the genders CSV

# colnames = ['Name','Occupation','ShortDescription','LongDescription','Gender',
# 'MaleScore','FemaleScore','NonBinaryScore','Gender + hand codes']

spkr_gndr = pandas.read_csv('../data/speakers-gender.csv', encoding='latin-1')
# spkr_gndr.head()

# Simplify the dataframe by dropping columns (especially that long description)

# sg2 = spkr_gndr.drop('LongDescription', axis=1)
# sg3 = sg2.drop('MaleScore', axis = 1)
# sg4 = sg3.drop('FemaleScore', axis = 1)
# sg5 = sg4.drop('NonBinaryScore', axis = 1)
# sg6 = sg5.drop('Gender', axis = 1)
# sg7 = sg6.drop('Unnamed: 9', axis = 1)

genders = sg7
# sg7.columns['name', 'occupation', 'description', 'gender'] # Didn't work.
# So I'm going to save the file to CSV, edit the headers, and then read it back in. 

genders.to_csv('../data/genders.csv')

In [4]:
# =-=-=-=-=-=-=-=-=-=-=
# Add genders to main dataframe
# =-=-=-=-=-=-=-=-=-=-= 

main_df = pandas.read_csv('../data/talks_4.csv')
gender_df = pandas.read_csv('../data/genders.csv')

In [5]:
gender_df.head()

Unnamed: 0,author,occupation,description,gender
0,Abha Dawesar,Novelist,Abha Dawesar writes to make sense of the world...,female
1,Abigail Washburn,Clawhammer banjo player,Abigail Washburn pairs venerable folk elements...,female
2,Achenyo Idachaba,Green entrepreneur,"Achenyo Idachaba is the head of MitiMeth, a Ni...",female
3,Aditi Gupta,"Social entrepreneur, co-founder of Menstrupedia",Aditi Gupta uses storytelling and art to educa...,female
4,Aditi Shankardass,Neuroscientist,Aditi Shankardass is pioneering the use of EEG...,female


In [6]:
# Works!
test_1 = main_df.merge(gender_df[['author', 'gender']], 'left')
test_1.head(20)

Unnamed: 0,citation,author,title,date,length,text,gender
0,Al Gore 2006,Al Gore,Averting the climate crisis,Jun 2006,957,Thank you so much Chris. And it's truly a gre...,male
1,David Pogue 2006,David Pogue,Simplicity sells,Jun 2006,1271,Hello voice mail my old friend. I've called f...,male
2,Cameron Sinclair 2006,Cameron Sinclair,My wish: A call for open-source architecture,Jul 2006,1398,I'm going to take you on a journey very quickl...,male
3,Sergey Brin + Larry Page 2007,Sergey Brin + Larry Page,The genesis of Google,May 2007,1205,Sergey Brin I want to discuss a question I kn...,
4,Nathalie Miebach 2011,Nathalie Miebach,Art made of storms,Oct 2011,247,What you just heard are the interactions of ba...,female
5,Richard Wilkinson 2011,Richard Wilkinson,How economic inequality harms societies,Oct 2011,998,You all know the truth of what I'm going to sa...,male
6,Malcolm Gladwell 2011,Malcolm Gladwell,The strange tale of the Norden bombsight,Oct 2011,883,Thank you. It's a real pleasure to be here. I ...,male
7,Jay Bradner 2011,Jay Bradner,Open-source cancer research,Oct 2011,752,I moved to Boston years ago from Chicago ...,male
8,Béatrice Coron 2011,Béatrice Coron,Stories cut from paper,Oct 2011,1077,I am a papercutter. I cut stories. So my proce...,
9,Hasan Elahi 2011,Hasan Elahi,"FBI, here I am!",Oct 2011,858,Hi there. I'm Hasan. I'm an artist. And usuall...,male


In [7]:
test_1.to_csv('../data/talks_5.csv')

Of the remaining methods below, I like this simple one-line code the best. It also strikes me as the most readable, the one where I can see what is going to happen:

    test_5 = main_df.join(gender_df.set_index('author').gender, on='author')
    
For the record, here are the other methods:

```python

# Works
d = dict(gender_df[['author', 'gender']].values)
test_2 = main_df.assign(gender=main_df.author.map(d))
test_2.head()

# I couldn't get this to work
d = DF2.set_index('author').gender
DF1.assign(gender=DF1.author.map(d))

# Works
d = gender_df.set_index('author').gender
test_3 = main_df.assign(gender=main_df.author.map(d))
test_3.head()

# Works
d = dict(zip(gender_df.author, gender_df.gender))
test_4 = main_df.assign(gender=main_df.author.map(d))
test_4.head()

# Works
test_5 = main_df.join(gender_df.set_index('author').gender, on='author')
test_5.head()
```

[Many thanks to SO's piRSquared](https://stackoverflow.com/questions/46498050/matching-data-across-pandas-dataframes/).

In [8]:
occupations = test_1.join(gender_df.set_index('author').occupation, on='author')

In [10]:
occupations.head()

Unnamed: 0,citation,author,title,date,length,text,gender,occupation
0,Al Gore 2006,Al Gore,Averting the climate crisis,Jun 2006,957,Thank you so much Chris. And it's truly a gre...,male,Climate advocate
1,David Pogue 2006,David Pogue,Simplicity sells,Jun 2006,1271,Hello voice mail my old friend. I've called f...,male,Technology columnist
2,Cameron Sinclair 2006,Cameron Sinclair,My wish: A call for open-source architecture,Jul 2006,1398,I'm going to take you on a journey very quickl...,male,"Co-founder, Architecture for Humanity"
3,Sergey Brin + Larry Page 2007,Sergey Brin + Larry Page,The genesis of Google,May 2007,1205,Sergey Brin I want to discuss a question I kn...,,
4,Nathalie Miebach 2011,Nathalie Miebach,Art made of storms,Oct 2011,247,What you just heard are the interactions of ba...,female,Artist


In [11]:
occupations.to_csv('../data/talks_6.csv')