In [None]:
import pandas as pd
import numpy as np

# Row labels and column order shared by both construction routes below.
row_labels = [1, 3, 5, 7, 9]
column_names = ['color', 'list', 'number']

# Route 1: a 2-D object array, one inner list per record.
# dtype=object lets a single cell hold a whole Python list.
rows = [
    ['Blue', [1, 2], 1.1],
    ['Red', [3, 4], 2.2],
    ['Pink', [5, 6], 3.3],
    ['Grey', [7, 8], 4.4],
    ['Black', [9, 10], 5.5],
]
data_np = np.array(rows, dtype=object)

ecommerce_from_numpy = pd.DataFrame(data_np, index=row_labels, columns=column_names)

print("DataFrame from NumPy array:")
print(ecommerce_from_numpy)

# Route 2: one labelled Series per column, assembled from a dict.
series_values = {
    'color': ['Blue', 'Red', 'Pink', 'Grey', 'Black'],
    'list': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]],
    'number': [1.1, 2.2, 3.3, 4.4, 5.5],
}
data_series = {
    name: pd.Series(values, index=row_labels)
    for name, values in series_values.items()
}

ecommerce_from_series = pd.DataFrame(data_series)

print("DataFrame from Pandas Series:")
print(ecommerce_from_series)

# Column dtypes of the array-backed frame.
print("Column types:")
print(ecommerce_from_numpy.dtypes)

# Python type of the first cell in each column of the array-backed frame.
print("Types of the first value of every column:")
for col, column in ecommerce_from_numpy.items():
    first_value = column.iloc[0]
    print(f"Column '{col}': {type(first_value)}")

In [None]:
# --- Data Loading, Cleaning, and Transformation ---

# 1. Read the semicolon-separated file ('?' cells are parsed as NaN), discard
#    the columns that are not needed, and index the rows by the 'Date' column.
print("Step 1: Loading data...")
file_path = 'household_power_consumption_2.txt'
ecommerce = (
    pd.read_csv(file_path, sep=';', na_values=['?'])
    .drop(columns=['Time', 'Sub_metering_2', 'Sub_metering_3'])
    .set_index('Date')
)

# 2. Convert the columns and the index to proper dtypes
print("Step 2: Updating data types...")
def update_types(ecommerce_to_update):
    """Return a converted copy of the frame: numeric columns, datetime index.

    Every column is passed through ``pd.to_numeric(..., errors='coerce')``, so
    any value that cannot be parsed as a number becomes NaN. The index is
    parsed as day/month/year dates (``'%d/%m/%Y'``).

    The conversion is applied to a copy; the frame passed in is not modified,
    which keeps the raw data available and makes the call safe to repeat.

    Parameters
    ----------
    ecommerce_to_update : pandas.DataFrame
        Frame whose index entries are dates in ``dd/mm/YYYY`` form.

    Returns
    -------
    pandas.DataFrame
        A new frame with the converted columns and a datetime index.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` when an index entry does not match
        the ``'%d/%m/%Y'`` format.
    """
    # Work on a copy so the caller's frame keeps its original values.
    converted = ecommerce_to_update.copy()
    for col in converted.columns:
        converted[col] = pd.to_numeric(converted[col], errors='coerce')
    converted.index = pd.to_datetime(converted.index, format='%d/%m/%Y')
    return converted
ecommerce_updated = update_types(ecommerce)

# Print the summary statistics explicitly: a bare expression in the middle of
# a cell is evaluated and then discarded, so nothing would be displayed.
print(ecommerce_updated.describe())

# 3. Drop rows with missing values and create a copy
print("Step 3: Dropping missing values...")
ecommerce_cleaned = ecommerce_updated.dropna().copy()

print("--- Sub_metering_1 before modification ---")
print(ecommerce_cleaned['Sub_metering_1'].head())

# 4. Apply the transformation: add 1 to Sub_metering_1, then multiply by 0.06
print("Step 4: Applying transformation...")
ecommerce_cleaned['Sub_metering_1'] = (ecommerce_cleaned['Sub_metering_1'] + 1) * 0.06

print("--- Sub_metering_1 after modification ---")
print(ecommerce_cleaned['Sub_metering_1'].head())

# 1. Select rows where Date >= 2008-12-27 and Voltage >= 242
print("--- 1. Filtering Data ---")
# Build each condition separately, then combine them element-wise.
on_or_after_cutoff = ecommerce_cleaned.index >= '2008-12-27'
voltage_at_least_242 = ecommerce_cleaned['Voltage'] >= 242
filtered_ecommerce = ecommerce_cleaned[on_or_after_cutoff & voltage_at_least_242]
print(f"Found {len(filtered_ecommerce)} rows matching the criteria.")

print("\n--- 2. 88888th Row of Filtered Data ---")
# iloc is zero-based, so position 88888 exists only when there are more than
# 88888 rows.
row_position = 88888
if len(filtered_ecommerce) <= row_position:
    print("There are not enough rows in the filtered data to select the 88888th row.")
else:
    print(filtered_ecommerce.iloc[row_position])


# 3. Find the date of the maximum Global_active_power
print("\n--- 3. Date of Maximum Global Active Power ---")
global_active_power = ecommerce_cleaned['Global_active_power']
# idxmax returns the index label (a timestamp) of the largest value.
max_power_date = global_active_power.idxmax()
print(f"The Global_active_power was maximal on: {max_power_date.date()}")


# 4. Sort by Global_active_power (descending), then Voltage (ascending),
#    and show the first three columns of the result
print("\n--- 4. Sorted DataFrame (First 3 Columns) ---")
sorted_ecommerce = ecommerce_cleaned.sort_values(
    by=['Global_active_power', 'Voltage'],
    ascending=[False, True],
)
first_three_columns = sorted_ecommerce.iloc[:, :3]
print(first_three_columns.head())


# 5. Compute the daily average of Global_active_power
print("\n--- 5. Daily Average of Global Active Power ---")
daily_avg_power = global_active_power.resample('D').mean()
print(daily_avg_power.head())



In [None]:
ecommerce = pd.read_csv("Ecommerce_purchases.txt", sep=',')
# Strip stray whitespace from the headers straight after loading, so every
# by-name column lookup that follows sees the same clean labels.
ecommerce.columns = ecommerce.columns.str.strip()

# Dataset dimensions: number of records, then number of columns.
print(f"Total Rows = {len(ecommerce.index)}")
print(f"Total Columns = {len(ecommerce.columns)}")

# Mean of the 'Purchase Price' column.
purchase_price = ecommerce['Purchase Price'].mean()
print(f"Average Purchase Price = {purchase_price}")

def _count_matches(column, value):
    """Return how many entries of `column` are equal to `value`, as an int."""
    return int((column == value).sum())


# Rows whose 'Language' is 'en'.
english_speakers = ecommerce['Language']
num_english_speakers = _count_matches(english_speakers, 'en')
print(f"Number of English Speakers = {num_english_speakers}")

# Rows whose 'Job' is exactly 'Lawyer'.
jobs = ecommerce['Job']
num_lawyers = _count_matches(jobs, 'Lawyer')
print(f"Number of Lawyers = {num_lawyers}")

# Purchases split by the 'AM or PM' column.
am_buyers = ecommerce['AM or PM']
num_am_buyers = _count_matches(am_buyers, 'AM')
print(f"Number of AM Buyers = {num_am_buyers}")

num_pm_buyers = _count_matches(am_buyers, 'PM')
print(f"Number of PM Buyers = {num_pm_buyers}")

# Frequency of each job title, most common first (computed but not printed).
top_jobs = jobs.value_counts()

# Step 1: Build a boolean mask marking the row(s) whose 'Lot' equals the
# requested lot. The lot is named once so the query and the printed message
# cannot drift apart.
lot_to_find = '90 WT'
is_the_correct_lot = ecommerce['Lot'] == lot_to_find

# Step 2: Use the mask to select the matching row(s) from the DataFrame.
transaction_row = ecommerce.loc[is_the_correct_lot]

# Step 3: Pull the single 'Purchase Price' value out of the selection.
# .item() raises ValueError unless exactly one row matched.
purchase_price = transaction_row['Purchase Price'].item()
print(f"Purchase Price for Lot '{lot_to_find}' = {purchase_price}")

# Same pattern for the email lookup: mask on 'Credit Card', select 'Email',
# and extract the single matching value.
credit_card_to_find = 4926535242672853
email_address = ecommerce.loc[ecommerce['Credit Card'] == credit_card_to_find, 'Email'].item()

print(f"Email of the person with Credit Card number '{credit_card_to_find}' = {email_address}")

# American Express purchases strictly above $95, matching the "above $95"
# wording of the printed message (a purchase of exactly $95.00 is excluded).
is_amex = ecommerce['CC Provider'] == 'American Express'
is_above_95 = ecommerce['Purchase Price'] > 95.0
american_express = ecommerce[is_amex & is_above_95]
print(f'Total american express above $95 {len(american_express)}')


# Count the cards whose 'CC Exp Date' string ends in '25'.
# NOTE(review): assumes the expiry strings end with a two-digit year; confirm
# against the data file.
expiry = ecommerce['CC Exp Date'].str.endswith('25')
num_expiring_2025 = expiry.sum()
print(f'Cards expiring in 2025 = {num_expiring_2025}')


# Remove leading/trailing whitespace from the column headers.
ecommerce.columns = ecommerce.columns.str.strip()

# Domain of each address: split on '@' and keep the element at position 1.
address_parts = ecommerce['Email'].str.split('@')
email_providers = address_parts.str.get(1)

# Occurrences per provider; value_counts() orders them from most to least
# frequent.
provider_counts = email_providers.value_counts()

# The five most frequent providers are the first five entries of that ranking.
top_5_providers = provider_counts.iloc[:5]

print("Top 5 most popular email providers:")
print(top_5_providers)

