In [1]:
import pandas as pd

In [2]:
# Imports needed by this cell, gathered at the top so the dependencies are
# visible before any work is done.
import warnings

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load the scraped listserv volumes; the code below reads the `dates` and
# `text` columns of this file.
humanist_vols = pd.read_csv('web_scraped_humanist_listserv.csv')

# Normalise the date separator ('-' -> '/') as a literal replacement, then
# split once and reuse the parts for both year columns.
humanist_vols['cleaned_dates'] = humanist_vols['dates'].str.replace('-', '/', regex=False)
date_parts = humanist_vols['cleaned_dates'].str.split('/')
humanist_vols['year_start'] = date_parts.str[0]
humanist_vols['year_end'] = date_parts.str[1]

# Volume size is measured as the number of newline characters in the text.
humanist_vols['volume_size'] = humanist_vols['text'].str.count('\n')

# Save our texts to a list (one document per volume).
documents = humanist_vols.text.tolist()

# Create a vectorizer and compute the TF-IDF matrix for all volumes.
vectorizer = TfidfVectorizer(max_df=0.7, min_df=1)
transformed_documents = vectorizer.fit_transform(documents)

# Densify so each row can be paired with the vocabulary below.
transformed_documents_as_array = transformed_documents.toarray()

# Look the vocabulary up once, outside the loop. `get_feature_names()` was
# removed in scikit-learn 1.2; prefer `get_feature_names_out()` and fall back
# to the old name only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Build one (term, score, dates) frame per volume, sorted by descending score.
dates = humanist_vols.dates.tolist()
tfidf_results = []
for counter, doc in enumerate(transformed_documents_as_array):
    one_doc_as_df = (
        pd.DataFrame({'term': feature_names, 'score': doc})
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )
    one_doc_as_df['dates'] = dates[counter]
    tfidf_results.append(one_doc_as_df)

# NOTE(review): this silences *all* warnings for the rest of the kernel
# session, not just for this cell. Kept for compatibility with later cells.
warnings.filterwarnings("ignore")

# Stack every volume's scores and order the whole table by descending score.
# The per-volume index is intentionally kept (no ignore_index).
tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False)



In [8]:
# Take the first ten distinct terms in the frame's current row order, then
# keep every row (across all volumes) whose term is one of those ten.
top_10_term = tfidf_df["term"].unique()[:10]
top_10_term_subset = tfidf_df.loc[tfidf_df["term"].isin(top_10_term)]

In [4]:
# Altair is the plotting library used by the chart cells in this notebook.
import altair as alt
# Select Altair's 'default' renderer for displaying charts.
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [12]:
# Preview the first rows of the top-10-term subset.
top_10_term_subset.head()

Unnamed: 0,term,score,dates
0,2004,0.804992,2003-2004
0,2004,0.792938,2004-2005
0,2007,0.777456,2007-2008
0,ninch,0.750433,2000-2001
0,utorepas,0.725871,1987-1988


In [13]:
# Reshape to long form: `term` and `dates` stay as identifier columns and the
# remaining column(s) are unpivoted into `variable` / `value` pairs.
melted_top_10 = pd.melt(top_10_term_subset, id_vars=['term', 'dates'])
melted_top_10

Unnamed: 0,term,dates,variable,value
0,2004,2003-2004,score,0.804992
1,2004,2004-2005,score,0.792938
2,2007,2007-2008,score,0.777456
3,ninch,2000-2001,score,0.750433
4,utorepas,1987-1988,score,0.725871
...,...,...,...,...
205,2004,1994-1995,score,0.000000
206,ninch,1995-1996,score,0.000000
207,utorepas,1994-1995,score,0.000000
208,ninch,1994-1995,score,0.000000


In [14]:
# Line chart of the melted top-10 terms: one coloured line per term, with the
# volume date range on the x axis and the melted value on the y axis.
(
    alt.Chart(melted_top_10)
    .mark_line()
    .encode(
        alt.X('dates'),
        alt.Y('value'),
        alt.Color('term'),
    )
)

In [33]:
# Collect the first five rows of every volume from `tfidf_df` (in the frame's
# current row order) into one table.
#
# `DataFrame.append` was removed in pandas 2.0 and re-copies the accumulated
# frame on every iteration, so the slices are gathered in a list and
# concatenated once instead.
volumes = tfidf_df.dates.unique()

top_terms_per_volume = [
    tfidf_df[tfidf_df["dates"] == volume].head(5) for volume in volumes
]

if top_terms_per_volume:
    empty_dataframe = pd.concat(top_terms_per_volume, ignore_index=True)
else:
    # No volumes at all: keep the original result, an empty frame with the
    # same columns (pd.concat raises on an empty list).
    empty_dataframe = pd.DataFrame(columns=tfidf_df.columns)

# The name `empty_dataframe` is kept because a later cell reads it.
empty_dataframe = empty_dataframe.sort_values(by=['dates'], ascending=True)
empty_dataframe


Unnamed: 0,term,score,dates
24,snobol,0.096907,1987-1988
21,vax,0.192768,1987-1988
22,rahtz,0.121988,1987-1988
23,coombs,0.106646,1987-1988
20,utorepas,0.725871,1987-1988
...,...,...,...
11,2008,0.426284,2007-2008
10,2007,0.777456,2007-2008
13,fludd,0.138820,2007-2008
14,1617,0.094775,2007-2008


In [38]:
# Reshape the per-volume top terms to long form, keeping `term` and `dates`
# as identifier columns.
melted_volume = pd.melt(empty_dataframe, id_vars=['term', 'dates'])

# Interactive scatter plot: one point per (term, volume), coloured by term,
# with the term shown as the hover tooltip.
(
    alt.Chart(melted_volume)
    .mark_point()
    .encode(
        alt.X('dates'),
        alt.Y('value'),
        alt.Color('term'),
        tooltip=["term"],
    )
    .interactive()
)