In [6]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization

# Set plot style for better visualizations
sns.set()
sns.set_palette("deep")

# Explanation:
# We're importing the necessary libraries for our analysis. Pandas and NumPy will be used for data manipulation,
# while Matplotlib and Seaborn will be used for creating visualizations. We're also setting a plot style
# to make our visualizations more appealing and consistent.

In [9]:
def detect_encoding(file_path):
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    return chardet.detect(raw_data)['encoding']

def read_csv_with_encoding(file_path, **kwargs):
    encodings = ['utf-8', 'ISO-8859-1', 'cp1252']

    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding, **kwargs)
        except UnicodeDecodeError:
            continue

    # If all encodings fail, try to detect the encoding
    detected_encoding = detect_encoding(file_path)
    try:
        return pd.read_csv(file_path, encoding=detected_encoding, **kwargs)
    except:
        raise ValueError(f"Unable to read the file with any of the attempted encodings. Detected encoding: {detected_encoding}")


In [14]:
# Load the datasets
# companies = pd.read_csv('companies.txt', sep='\t')  # Tab-separated file
rounds = pd.read_csv('rounds2.csv')  # Comma-separated file
mapping = pd.read_csv('mapping.csv')  # Comma-separated file

# Display dataset shapes
print("Dataset shapes:")
print(f"Companies: {companies.shape}")
print(f"Rounds: {rounds.shape}")
print(f"Mapping: {mapping.shape}")

# Display the first few rows of each dataset
print("\nFirst few rows of companies dataset:")
print(companies.head())
print("\nFirst few rows of rounds dataset:")
print(rounds.head())
print("\nFirst few rows of mapping dataset:")
print(mapping.head())

# Check for missing values
print("\nMissing values in companies dataset:")
print(companies.isnull().sum())
print("\nMissing values in rounds dataset:")
print(rounds.isnull().sum())
print("\nMissing values in mapping dataset:")
print(mapping.isnull().sum())

# Merge the datasets
df = pd.merge(rounds, companies, left_on='company_permalink', right_on='permalink', how='left')
df = pd.merge(df, mapping, on='category_list', how='left')

print("\nMerged dataframe shape:", df.shape)
print("\nColumns in the merged dataframe:")
print(df.columns)

# Explanation:
# 1. We load the three datasets: companies, rounds, and mapping.
# 2. We display the shape of each dataset to understand their size.
# 3. We show the first few rows of each dataset to get an idea of their structure.
# 4. We check for missing values in each dataset to identify potential data quality issues.
# 5. We merge the datasets based on common columns:
#    - First, we merge rounds and companies on 'company_permalink' and 'permalink'
#    - Then, we merge the result with mapping on 'category_list'
# 6. Finally, we display the shape and columns of the merged dataframe.

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 73880: invalid start byte