# Descriptive Statistics and Visualizations for Dataset

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
# NOTE(review): pd.read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only point file_path at a pickle from a trusted source.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = 'final_test_data_set.pkl'
dataset = pd.read_pickle(file_path)

# Display basic info
# info() prints its summary itself; head() is deliberately the last expression
# of the cell so the notebook renders the first rows as the cell output.
dataset.info()
dataset.head()

## Descriptive Statistics

In [None]:
# Generate descriptive statistics
def _as_hashable(value):
    """Return `value` unchanged if it can be hashed, otherwise its `repr` string."""
    try:
        hash(value)
    except TypeError:
        return repr(value)
    return value


# describe(include='all') summarizes object columns through value_counts(),
# which hashes every cell. Unhashable cells (e.g. lists -- other cells in this
# notebook treat 'reference_contexts' values as collections of contexts) make
# that raise "TypeError: unhashable type". Describe a copy in which such cells
# are replaced by their repr; `dataset` itself is left untouched.
describable = dataset.copy()
for column_name, column in dataset.items():
    if column.dtype == object:
        # astype(object) keeps the column from being re-inferred as numeric,
        # so it is still summarized with count/unique/top/freq.
        describable[column_name] = column.map(_as_hashable).astype(object)

descriptive_stats = describable.describe(include='all')

# Last expression of the cell: rendered as a rich table instead of wrapped text.
descriptive_stats

### General Statistics

In [None]:
def _mean_context_length(contexts):
    """Return the mean length of the items in `contexts`, or NaN if it is empty.

    Guarding the empty case avoids a ZeroDivisionError for rows that have no
    reference contexts.
    """
    # Compare len() to 0 instead of relying on truthiness so that array-like
    # containers (whose truth value may be ambiguous) are handled as well.
    if len(contexts) == 0:
        return float('nan')
    return sum(len(context) for context in contexts) / len(contexts)


# Count number of unique questions
unique_questions = dataset['user_input'].nunique()

# Average number of contexts per question
average_contexts = dataset['reference_contexts'].apply(len).mean()

# Average context length in characters: the mean of the per-question means.
# Rows without any context yield NaN, which Series.mean() skips by default.
average_context_length = dataset['reference_contexts'].apply(_mean_context_length).mean()

# Average answer length in characters
average_answer_length = dataset['reference'].apply(len).mean()

# Collect the four figures in a one-row summary table
summary_stats = pd.DataFrame({
    'Number of Unique Questions': [unique_questions],
    'Average Contexts per Question': [average_contexts],
    'Average Context Length (characters)': [average_context_length],
    'Average Answer Length (characters)': [average_answer_length]
})

# Last expression of the cell: rendered as a rich table instead of wrapped text.
summary_stats

## Visualizations

### Distribution of User Input Lengths

In [None]:
# Length (len) of every value in the 'user_input' column
user_input_lengths = dataset['user_input'].map(len)

# Histogram of those lengths, drawn through an explicit Figure/Axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(user_input_lengths, bins=20, edgecolor='black', color='#FFA500')
ax.set_title('Distribution of User Input Lengths')
ax.set_xlabel('Length of User Input')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

### Distribution of Topics

In [None]:
# Number of rows for each distinct value of the 'topics' column
topic_counts = dataset['topics'].value_counts()

# Bar chart of the counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
topic_counts.plot.bar(ax=ax, color='#FFA500', edgecolor='black')
ax.set_title('Distribution of Topics')
ax.set_xlabel('Topics')
ax.set_ylabel('Count')
# Slant the category labels
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

### Distribution of Single-Hop vs Multi-Hop Queries

In [None]:
# Number of rows for each distinct value of the 'synthesizer_name' column
synth_counts = dataset['synthesizer_name'].value_counts()

# Bar chart of the counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
synth_counts.plot.bar(ax=ax, color='#FFA500', edgecolor='black')
ax.set_title('Distribution of Single-Hop vs Multi-Hop')
ax.set_xlabel('Synthesizer Name')
ax.set_ylabel('Count')
# Slant the category labels
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

### Distribution of Languages

In [None]:
# Number of rows for each distinct value of the 'language' column
language_counts = dataset['language'].value_counts()

# Bar chart of the counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
language_counts.plot.bar(ax=ax, color='#FFA500', edgecolor='black')
ax.set_title('Distribution of Languages')
ax.set_xlabel('Language')
ax.set_ylabel('Count')
# Keep the category labels horizontal
ax.tick_params(axis='x', labelrotation=0)
fig.tight_layout()
plt.show()

## Final Remarks

This notebook summarizes the descriptive statistics and visualizations for the dataset, covering user input lengths, topic distributions, query types, and language distributions.