In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the event attendance CSV into `data`, the frame the cells below work from.
data = pd.read_csv(filepath_or_buffer='/content/event_data.csv')

In [None]:
# Display the column labels of the loaded event data.
data.columns

#### Top 10 Students by Number of Events Attended

In [None]:
# Tally how many rows each Name has; value_counts() returns them most frequent first
event_counts = data['Name'].value_counts()

# Keep only the Names that appear more than once
multiple_events = event_counts[event_counts > 1]

# All attendance rows that belong to those repeat attendees
students_multiple_events = data[data['Name'].isin(multiple_events.index)]

# The counts are already in descending order, so the first 10 entries are the top 10
top10_students = multiple_events.head(10)

# Anonymise the chart: swap real Names for "Student 1", "Student 2", ...
top10_students.index = [f'Student {rank}' for rank in range(1, len(top10_students) + 1)]

# Bar chart of the most frequent attendees
fig, ax = plt.subplots(figsize=(10, 6))
top10_students.plot(kind='bar', color='teal', ax=ax)
ax.set_title('Top 10 Students by Number of Events Attended')
ax.set_xlabel('Student Number')
ax.set_ylabel('Number of Events Attended')
plt.xticks(rotation=45)
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()

#### Distribution of Event Attendance

In [None]:
# For each attendance count (1, 2, 3, ... events), how many students have exactly that count
attendance_distribution = event_counts.value_counts()

# Split the distribution at 5 events: everything above is merged into one bucket
over_five = attendance_distribution.index > 5
more_than_5 = attendance_distribution[over_five].sum()
attendance_distribution = attendance_distribution[~over_five]

# Append the merged bucket under its own label
attendance_distribution.loc['More than 5'] = more_than_5

# Pie chart of the resulting categories
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    attendance_distribution.values,
    labels=attendance_distribution.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.Paired.colors,
    textprops={'fontsize': 12},
)

ax.set_title('Distribution of Event Attendance')
ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Relies on `event_counts` (number of events attended per student) from the top-10 cell.

# Number of students per attendance count.  sort_index() orders the categories by
# attendance count (1, 2, 3, ...) instead of by frequency, so the x-axis reads as a distribution.
attendance_distribution = event_counts.value_counts().sort_index()

# Merge every attendance count above 15 into a single bucket
more_than_15 = attendance_distribution[attendance_distribution.index > 15].sum()
attendance_distribution = attendance_distribution[attendance_distribution.index <= 15]

# Append the merged bucket last, under the label '>15'
attendance_distribution.loc['>15'] = more_than_15

# Bar chart: bars are drawn at integer positions and labelled with the categories further down
plt.figure(figsize=(10, 6))
bars = plt.bar(range(len(attendance_distribution)), attendance_distribution.values, color='skyblue', alpha=0.7)

# Write each bar's count just above it, centred horizontally on the bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, int(yval), va='bottom', ha='center')

plt.title('Distribution of Event Attendance')
plt.xlabel('Number of Events Attended')
plt.ylabel('Number of Students')
plt.xticks(range(len(attendance_distribution)), attendance_distribution.index)  # Category labels on the x-axis
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Relies on `event_counts` (number of events attended per student) from the top-10 cell.

# Number of students per attendance count.  sort_index() orders the categories by
# attendance count (2, 3, 4, ...) instead of by frequency, so the x-axis reads as a distribution.
attendance_distribution = event_counts.value_counts().sort_index()

# Total of every attendance count above 15, to be shown as a single bucket
more_than_15 = attendance_distribution[attendance_distribution.index > 15].sum()

# Keep counts up to 15 and drop the single-event group, which this chart leaves out
attendance_distribution = attendance_distribution[
    (attendance_distribution.index <= 15) & (attendance_distribution.index != 1)
]

# Append the merged bucket last, under the label '>15'
attendance_distribution.loc['>15'] = more_than_15

# Bar chart: bars are drawn at integer positions and labelled with the categories further down
plt.figure(figsize=(10, 6))
bars = plt.bar(range(len(attendance_distribution)), attendance_distribution.values, color='skyblue', alpha=0.7)

# Write each bar's count just above it, centred horizontally on the bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, int(yval), va='bottom', ha='center')

plt.title('Distribution of Event Attendance Excluding Single Event Attendance')
plt.xlabel('Number of Events Attended')
plt.ylabel('Number of Students')
plt.xticks(range(len(attendance_distribution)), attendance_distribution.index)  # Category labels on the x-axis
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

#### List of Events Attended by the Top 15 Students

In [None]:
# Names of the 15 students with the highest attendance counts
top15_students = event_counts.nlargest(15).index

# Attendance rows that belong to those students
top15_students_data = data[data['Name'].isin(top15_students)]

# Map each student to a list of (event, date) pairs.  Iterating over the groups directly
# avoids DataFrameGroupBy.apply, which is deprecated when the applied function also
# receives the grouping column; groups are still visited in sorted Name order.
student_events = {
    student: list(zip(records['Event'], records['Date']))
    for student, records in top15_students_data.groupby('Name')
}

# Print the events and dates attended by each of these students
# NOTE(review): this prints real names, unlike the anonymised top-10 chart — confirm that is intended.
for student, events in student_events.items():
    print(f"{student} attended the following events:")
    for event, date in events:
        print(f" - {event} on {date}")
    print("\n")  # Extra blank lines to separate students

Pretty table version

In [None]:
!pip install prettytable

In [None]:
import pandas as pd
from prettytable import PrettyTable

def truncate_string(s, length=30):
    """Return `s` shortened to at most `length` characters.

    Strings that already fit are returned unchanged.  Longer strings are cut so
    that the result, including a trailing '...', is exactly `length` characters
    long.  When `length` is too small to hold the ellipsis (3 or less), the
    string is cut to `length` characters without an ellipsis.

    Args:
        s: The string to shorten.
        length: Maximum length of the returned string (default 30).

    Returns:
        The original string, or its truncated form.
    """
    if len(s) <= length:
        return s
    ellipsis = '...'
    if length <= len(ellipsis):
        # No room for the ellipsis itself; max() keeps a negative length from slicing from the end
        return s[:max(length, 0)]
    # Reserve room for the ellipsis based on `length`, not on a fixed 27 characters
    return s[:length - len(ellipsis)] + ellipsis

# Count the rows for every (Name, Event) pair
event_counts_table = (
    data.groupby(['Name', 'Event'])
    .size()
    .reset_index(name='Count')
)

# Rewrite each event as "<shortened event name>: <count>"
short_event_names = event_counts_table['Event'].map(truncate_string)
count_text = event_counts_table['Count'].astype(str)
event_counts_table['Event'] = short_event_names + ': ' + count_text

# One newline-separated string per student listing all of that student's event labels
grouped_events = (
    event_counts_table.groupby('Name')['Event']
    .apply('\n'.join)
    .reset_index(name='Events')
)

# Total attendance per student
total_events = (
    event_counts_table.groupby('Name')['Count']
    .sum()
    .reset_index(name='Total')
)

# Combine the per-event listing with the totals
final_table = grouped_events.merge(total_events, on='Name')

# Keep the 10 students with the highest totals, highest first
final_table = final_table.sort_values(by='Total', ascending=False).head(10)

# Prepend a 1-based ranking column
final_table.insert(0, 'Position', range(1, 1 + len(final_table)))

# Build the PrettyTable with the DataFrame's column names as headers
pretty_table = PrettyTable()
pretty_table.field_names = list(final_table.columns)

# Copy the rows across, following each student with an all-empty spacer row
column_count = len(final_table.columns)
for row_label, row in final_table.iterrows():
    pretty_table.add_row(row)
    pretty_table.add_row([''] * column_count)

# Render to text; the replace collapses any line that consists of a single space
table_string = pretty_table.get_string().replace('\n \n', '\n')

# Print the rendered table
print(table_string)


#### Hacker Nation Attendance

In [None]:
# Parse 'Date' into datetimes; format='mixed' infers the format of each value individually
data['Date'] = pd.to_datetime(data['Date'], format='mixed')

# Order rows by student and then chronologically
sorted_data = data.sort_values(by=['Name', 'Date'])

# Earliest attendance row per student.  drop_duplicates keeps the whole first row, whereas
# groupby('Name').first() takes the first non-null value of each column separately and
# could pair an Event with a Date from a different row.  Rows without a Name are dropped,
# as groupby would have done.
first_events = (
    sorted_data
    .dropna(subset=['Name'])
    .drop_duplicates(subset='Name', keep='first')
    .reset_index(drop=True)
)

# Split students by whether their first event is recorded as 'HN' (Hacker Nation)
is_hn_first = first_events['Event'] == 'HN'
hacker_nation_count = int(is_hn_first.sum())
other_events_count = int((~is_hn_first).sum())

# Pie chart settings
labels = ['Hacker Nation', 'Other Events']
sizes = [hacker_nation_count, other_events_count]
colors = ['skyblue', 'lightcoral']
explode = (0.1, 0)  # pull the first slice out from the pie

plt.figure(figsize=(8, 8))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
        shadow=True, startangle=140)

plt.title('Percentage of Students Whose First Event was "Hacker Nation" vs Other Events')
plt.show()

In [None]:
# Number of attendance rows per student (uses `sorted_data`, ordered by Name then Date)
event_counts_sorted = sorted_data['Name'].value_counts()

# Students with at least two attendance rows
students_2_or_more_events = event_counts_sorted[event_counts_sorted >= 2].index

# Attendance rows of those students, still in Name/Date order
students_data_sorted = sorted_data[sorted_data['Name'].isin(students_2_or_more_events)]

# Earliest attendance row per student.  drop_duplicates keeps the whole first row, whereas
# groupby('Name').first() takes the first non-null value of each column separately and
# could pair an Event with a Date from a different row.
first_events_students = (
    students_data_sorted
    .drop_duplicates(subset='Name', keep='first')
    .reset_index(drop=True)
)

# Split these students by whether their first event is recorded as 'HN' (Hacker Nation)
is_hn_first = first_events_students['Event'] == 'HN'
hacker_nation_first_count = int(is_hn_first.sum())
other_first_count = int((~is_hn_first).sum())

# Bar chart settings
labels = ['Hacker Nation First', 'Other Event First']
counts = [hacker_nation_first_count, other_first_count]
colors = ['skyblue', 'lightcoral']

plt.figure(figsize=(10, 6))
bars = plt.bar(labels, counts, color=colors, alpha=0.7)

# Write each bar's count just above it, centred horizontally on the bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, int(yval), va='bottom', ha='center')

plt.title('Students Attending 2 or More Events: First Event Attendance')
plt.xlabel('First Event')
plt.ylabel('Number of Students')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()


In [None]:
# Students whose attendance count in `event_counts` is at least 2
students_2_or_more_events = event_counts[event_counts >= 2].index

# Attendance rows of those students
students_data = data[data['Name'].isin(students_2_or_more_events)]

# Distinct names among them that have at least one row recorded as 'HN' (Hacker Nation)
is_hn_row = students_data['Event'] == 'HN'
hacker_nation_attendees = students_data.loc[is_hn_row, 'Name'].unique()

# How many of the repeat attendees attended Hacker Nation at least once
hacker_nation_count = len(hacker_nation_attendees)

# The remaining repeat attendees have no 'HN' row
no_hacker_nation_count = len(students_2_or_more_events) - hacker_nation_count

# Bar chart settings
labels = ['Attended Hacker Nation', 'Did Not Attend Hacker Nation']
counts = [hacker_nation_count, no_hacker_nation_count]
colors = ['skyblue', 'lightcoral']

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(labels, counts, color=colors, alpha=0.7)

# Write each bar's count just above it, centred horizontally on the bar
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval, int(yval), va='bottom', ha='center')

ax.set_title('Students Attending 2 or More Events: Attendance of "Hacker Nation"')
ax.set_xlabel('Attendance')
ax.set_ylabel('Number of Students')
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

#### Attendance By Event

In [None]:
# Number of attendance rows per event, largest first
event_attendance = data['Event'].value_counts()

# One bar per event, in descending order of attendance
fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.bar(event_attendance.index, event_attendance.values, color='skyblue', alpha=0.7)

# Write each bar's count just above it, centred horizontally on the bar
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval, int(yval), va='bottom', ha='center')

ax.set_title('Events with the Highest Attendance')
ax.set_xlabel('Event')
ax.set_ylabel('Number of Attendees')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()


#### Full Event Data Analysis

Questions:
- Are the people signing up for programs the same people going to events?
> If so, which events are they going to?

- Who are the people engaging with the programs?

In [None]:
# Read the event and program CSVs (in that order) into their own frames
event_data, program_data = map(
    pd.read_csv,
    ('/content/event_data.csv', '/content/program_data.csv'),
)

In [None]:
# Display the column labels of the event data.
event_data.columns

In [None]:
# Display the column labels of the program data.
program_data.columns

Students that attended both programs and events

In [None]:
# Distinct, non-missing names in each dataset
unique_event_names = set(event_data['Name'].dropna().unique())
unique_program_names = set(program_data['Name'].dropna().unique())

# Names present in both datasets
common_names = unique_event_names.intersection(unique_program_names)

# Report how many names overlap
print(f"Number of students in both datasets: {len(common_names)}\n")
print("Names of students who attended both events and programs:\n")

# Print one name per line, sorted so the output is identical on every run
# (iterating the set directly gives an order that can change between kernel sessions)
for name in sorted(common_names, key=str):
    print(name)

In [None]:
# Size of the program population and of its overlap with event attendees
total_program_students = len(unique_program_names)
common_students_count = len(common_names)

# Overlap as a percentage of all program students
percentage_common = (common_students_count / total_program_students) * 100

# Two bars: everyone in the program data, and the subset that also attended events
bar_labels = ['Total Program Students', 'Also Attended Events']
bar_heights = [total_program_students, common_students_count]

fig, ax = plt.subplots()
ax.bar(bar_labels, bar_heights, color=['blue', 'green'])
ax.set(ylabel='Number of Students', title='Program Participation and Event Attendance')

# Put the percentage just above the second bar (x position 1)
ax.text(x=1, y=common_students_count, s=f'{percentage_common:.2f}%',
        ha='center', va='bottom', color='black')

plt.show()

Trying to see whether people attend events or programs first (no data on the dates for programs)

In [None]:
# Narrow both frames to the name and date columns
# NOTE(review): the note above this cell says the program data has no dates; if
# program_data lacks a 'Date' column this selection raises KeyError — confirm.
date_columns = ['Name', 'Date']
event_data_filtered = event_data[date_columns]
program_data_filtered = program_data[date_columns]

# Distinct, non-missing names per dataset
unique_event_names = set(event_data['Name'].dropna().unique())
unique_program_names = set(program_data['Name'].dropna().unique())

# Names present in both datasets
common_names = unique_event_names & unique_program_names

# One-column frame of the shared names, used as the left side of the joins below
common_names_df = pd.DataFrame(list(common_names), columns=['Name'])

# Left-join so every shared name picks up its dates from each dataset
common_events = common_names_df.merge(event_data_filtered, on='Name', how='left')
common_programs = common_names_df.merge(program_data_filtered, on='Name', how='left')

# !!! Print the dates (if/when the data exists)

In [None]:
# Narrow both frames to the name plus the event/program label
event_data_filtered = event_data[['Name', 'Event']]
program_data_filtered = program_data[['Name', 'Program']]

# One-column frame of the names found in both datasets (`common_names` comes from an earlier cell)
common_names_df = pd.DataFrame(list(common_names), columns=['Name'])

# Left-join so every shared name picks up its events and its programs
common_events = pd.merge(common_names_df, event_data_filtered, on='Name', how='left')
common_programs = pd.merge(common_names_df, program_data_filtered, on='Name', how='left')

# Show every row for this output only.  option_context restores the previous
# 'display.max_rows' value on exit, even if printing raises.
with pd.option_context('display.max_rows', None):
    print("Events attended by students who also attended programs:")
    print(common_events)
    print("\nPrograms attended by students who also attended events:")
    print(common_programs)

In [None]:
# Event tallies for students who also appear in the program data.  Kept under its own
# name so the notebook-wide `event_counts` (events attended per student) is not
# overwritten, which would break a re-run of the earlier attendance cells.
common_event_counts = common_events['Event'].value_counts()

# Fold every event other than 'HN' into a single "Other Events" entry
other_events_sum = common_event_counts[common_event_counts.index != 'HN'].sum()
common_event_counts = common_event_counts[common_event_counts.index == 'HN']
common_event_counts['Other Events'] = other_events_sum

# Show "HN" under its full name
common_event_counts = common_event_counts.rename(index={'HN': 'Hacker Nation'})

# Pie chart of Hacker Nation versus all other events
plt.figure(figsize=(10, 8))
plt.pie(common_event_counts, labels=common_event_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Events Attended by Students Who Also Attended a Program')
plt.axis('equal')

plt.show()

In [None]:
# Number of rows per program among students who also attended events
program_counts = common_programs['Program'].value_counts()

# Pie chart with one slice per program
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(program_counts, labels=program_counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Programs Attended by Participants That Also Attended Events')
ax.axis('equal')

plt.show()

In [None]:
# Distinct, non-missing names with at least one 'HN' row.  dropna() keeps a missing
# name from being counted as an attendee in the denominator below.
hn_attendees = event_data.loc[event_data['Event'] == 'HN', 'Name'].dropna().unique()

# Distinct, non-missing names in the program data
program_attendees = program_data['Name'].dropna().unique()

# HN attendees who also appear in the program data (set lookup instead of scanning the array per name)
program_attendee_set = set(program_attendees)
hn_program_common = [name for name in hn_attendees if name in program_attendee_set]

# Share of HN attendees found in the program data; 0 when there are no HN attendees at all
percentage = (len(hn_program_common) / len(hn_attendees)) * 100 if len(hn_attendees) else 0.0

# Print the result
print(f"Out of the students that attended the 'HN' event, {percentage:.2f}% of them appear in the program data.")

In [None]:
# Distinct, non-missing names with at least one 'HN' row
hn_attendees = event_data.loc[event_data['Event'] == 'HN', 'Name'].dropna().unique()

# Distinct, non-missing names in the program data.  dropna() keeps a missing
# name from being counted as a participant in the denominator below.
program_attendees = program_data['Name'].dropna().unique()

# Program participants who also attended HN (set lookup instead of scanning the array per name)
hn_attendee_set = set(hn_attendees)
program_hn_common = [name for name in program_attendees if name in hn_attendee_set]

# Share of program participants who attended HN; 0 when the program data has no names
percentage = (len(program_hn_common) / len(program_attendees)) * 100 if len(program_attendees) else 0.0

# Print the result
print(f"Out of the students that appear in the program data, {percentage:.2f}% of them attended the 'HN' event.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the datasets
program_data = pd.read_csv('/content/program_data.csv')
event_data = pd.read_csv('/content/event_data.csv')

# Normalise names (trim whitespace, Title Case) before matching across the two files
program_data['Name'] = program_data['Name'].str.strip().str.title()
event_data['Name'] = event_data['Name'].str.strip().str.title()

# Convert date strings to datetime objects
# NOTE(review): the notebook's notes say the program data has no date information —
# confirm that program_data has a usable 'Date' column.
program_data['Date'] = pd.to_datetime(program_data['Date'])
# format='mixed' infers the format of each value individually, as the earlier parse of this
# file does.  Without it, errors='coerce' can turn every value that does not match the
# first inferred format (e.g. dates without times) into NaT.
event_data['Date'] = pd.to_datetime(event_data['Date'], format='mixed', errors='coerce')

# Pair every program row with every event row of the same Name
merged_data = pd.merge(program_data, event_data, on='Name', suffixes=('_program', '_event'))

# Number of (program row, event row) pairs produced by the merge
print(f"Total matches found: {len(merged_data)}")

# Earliest event date per student, repeated on each of that student's rows
merged_data['First Event Date'] = merged_data.groupby('Name')['Date_event'].transform('min')

# True where the student's first event precedes this row's program date
# (a comparison involving NaT evaluates to False)
merged_data['First Event Before Program'] = merged_data['First Event Date'] < merged_data['Date_program']

# Per student: True if the first event precedes at least one of their program dates
first_event_before_program = merged_data.groupby('Name')['First Event Before Program'].any()

# Share of matched students with an event before a program
percent_before = first_event_before_program.mean() * 100

# Print the percentage
print(f"Percentage of students who attended at least one event before their program: {percent_before:.2f}%")

# Preview the per-student detail, one row per distinct combination of the selected columns
detailed_data = merged_data[['Name', 'Date_program', 'Program', 'First Event Date', 'First Event Before Program']].drop_duplicates()
print(detailed_data.head(10))

#### Conclusions

- People joining programs are usually not the same ones attending events.
- Only around 20% of people have both attended an event and joined a program.
- When they do, it's almost always HN.
- Nearly all people who attend more than one event attend HN -> but they don't necessarily attend one first and then the other.

- there is no way to know whether HN encourages students to apply for programs, or whether students that have attended programs are more likely to be interested in HN (there is no date information for the programs)

- Still, only a small share of HN attendees also participate in programs (4.59%), and only a small share of program participants attended HN (12.5%).
> there is not a lot of overlap between events and conversions to programs

> hacker nation is the event that makes people return (or the event that people who return like the most)


to do:
- Check info on people in the programs list (undergrads/faculty/grad students) -> are they more likely to be faculty/PhDs?