In [8]:
import pandas as pd

# Load the scraped Humanist listserv volumes CSV into a DataFrame.
data_path = 'web_scraped_humanist_listserv_volumes.csv'
humanist_data = pd.read_csv(data_path)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
humanist_data.info()

# Only the last expression of a cell is rendered automatically, so the preview
# has to go through display() (provided by the IPython kernel) to be shown.
display(humanist_data.head())

# Last expression of the cell: the list of column names.
humanist_data.columns.tolist()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   volume_text          33 non-null     object
 1   volume_link          33 non-null     object
 2   volume_dates         33 non-null     object
 3   volume_number        33 non-null     int64 
 4   inferred_start_year  33 non-null     int64 
 5   inferred_end_year    33 non-null     int64 
dtypes: int64(3), object(3)
memory usage: 1.7+ KB
None


['volume_text',
 'volume_link',
 'volume_dates',
 'volume_number',
 'inferred_start_year',
 'inferred_end_year']

In [6]:
# Split the volumes into two eras on their inferred start year:
# anything starting before 2000 is "early internet", 2000 onwards is "Web 2.0".
start_year = humanist_data['inferred_start_year']
early_internet = humanist_data.loc[start_year < 2000]
web_2_0 = humanist_data.loc[start_year >= 2000]


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to preprocess and vectorize text
def tfidf_analysis(data, max_df=0.8, min_df=2, stop_words='english'):
    """Build a document-term TF-IDF table from the ``'volume_text'`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'volume_text'`` column; each row is treated as one
        document.
    max_df : float or int, default 0.8
        Forwarded to ``TfidfVectorizer(max_df=...)``.
    min_df : float or int, default 2
        Forwarded to ``TfidfVectorizer(min_df=...)``.
    stop_words : str, list or None, default 'english'
        Forwarded to ``TfidfVectorizer(stop_words=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per document, in the same order as ``data`` but with a fresh
        0..n-1 index, and one column per vocabulary term kept by the
        vectorizer. Cells hold the TF-IDF weights.
    """
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(data['volume_text'])
    # toarray() returns a plain ndarray that DataFrame can wrap directly; this
    # avoids np.matrix (from todense()) and a Python list-of-lists round trip.
    return pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

# Run the TF-IDF analysis once per era (early internet first, then Web 2.0)
tfidf_early, tfidf_web_2_0 = [
    tfidf_analysis(era_subset) for era_subset in (early_internet, web_2_0)
]


In [10]:
import altair as alt

# Function to create a plot for top terms
def plot_top_terms(tfidf_df, title, top_n=10):
    """Bar chart of the terms with the highest mean TF-IDF score.

    Parameters
    ----------
    tfidf_df : pandas.DataFrame
        Document-term table: one column per term, one row per document.
    title : str
        Chart title.
    top_n : int, default 10
        Number of highest-scoring terms to show.

    Returns
    -------
    altair.Chart
        Horizontal bar chart with terms on the y axis, ordered by descending
        score, and the mean score on the x axis.
    """
    # Column-wise mean gives one score per term; keep only the top_n largest
    # and turn the resulting Series into a two-column ('term', 'score') frame.
    top_terms = (
        tfidf_df.mean()
        .nlargest(top_n)
        .rename_axis('term')
        .reset_index(name='score')
    )
    return (
        alt.Chart(top_terms)
        .mark_bar()
        .encode(
            x=alt.X('score:Q', title='Mean TF-IDF score'),
            # sort='-x' orders the bars by descending x value (the score).
            y=alt.Y('term:N', sort='-x', title='Term'),
            tooltip=['term:N', 'score:Q'],
        )
        .properties(title=title)
    )

# Build one top-terms bar chart per era
plot_early = plot_top_terms(
    tfidf_df=tfidf_early,
    title='Top Terms in Early Internet Era',
)
plot_web_2_0 = plot_top_terms(
    tfidf_df=tfidf_web_2_0,
    title='Top Terms in Web 2.0 Era',
)

# Render the two charts side by side (same as `plot_early | plot_web_2_0`)
alt.hconcat(plot_early, plot_web_2_0)
