# Data Exploration of RockYou Dataset

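This notebook explores the RockYou password dataset: it loads and cleans the raw password list, visualizes the password-length distribution and the most common characters, and saves the cleaned passwords to CSV in preparation for training our models.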

In [None]:
# Import necessary libraries
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Set visualization style.
# `sns.set` is only an alias of `sns.set_theme`; seaborn documents `set_theme`
# as the preferred interface and notes the alias may be removed.
sns.set_theme(style='whitegrid')

In [None]:
# Load the RockYou dataset.
# latin-1 maps every byte value to a character, so decoding never raises
# whatever bytes the file contains.
rockyou_path = '../data/raw/rockyou.txt'
with open(rockyou_path, 'r', encoding='latin-1') as file:
    # Clean the passwords while streaming the file line by line (this avoids
    # holding both the raw `readlines()` list and the cleaned list in memory).
    #
    # Only the line terminator is removed. A bare `str.strip()` would also
    # remove leading/trailing spaces and -- after latin-1 decoding -- the
    # characters '\xa0' and '\x85', which Python treats as whitespace. That
    # silently truncates any password whose last byte is 0xA0 or 0x85
    # (e.g. a UTF-8 encoded "à", bytes C3 A0).
    #
    # Lines that are empty once the terminator is gone are skipped.
    passwords = [pwd for pwd in (line.rstrip('\n') for line in file) if pwd]

print(f'Total passwords loaded: {len(passwords)}')

In [None]:
# Plot how password lengths are distributed across the dataset
password_lengths = list(map(len, passwords))

# Explicit figure/axes handles instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(password_lengths, bins=30, kde=True, ax=ax)
ax.set_title('Password Length Distribution')
ax.set_xlabel('Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Analyze common characters in passwords
from collections import Counter
from itertools import chain

# Count characters by streaming over every password in turn. This yields the
# same tallies as counting ''.join(passwords) but never materializes one
# corpus-sized string just to iterate over it.
char_counts = Counter(chain.from_iterable(passwords))
common_chars = char_counts.most_common(10)

# Plot common characters
if common_chars:
    # Split [(char, count), ...] into two parallel sequences for the bar plot.
    chars, counts = zip(*common_chars)
    plt.figure(figsize=(10, 5))
    sns.barplot(x=list(chars), y=list(counts))
    plt.title('Most Common Characters in Passwords')
    plt.xlabel('Characters')
    plt.ylabel('Frequency')
    plt.show()
else:
    # With no characters, zip(*[]) yields nothing and the two-name unpacking
    # would raise ValueError, so skip the plot instead.
    print('No characters to plot: the password list is empty.')

In [None]:
# Save processed passwords to CSV for further analysis
processed_path = '../data/processed/passwords.csv'
# `DataFrame.to_csv` does not create missing parent directories and raises
# OSError if the target folder is absent, so make sure it exists first.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
pd.DataFrame(passwords, columns=['password']).to_csv(processed_path, index=False)
print(f'Processed passwords saved to {processed_path}')