<a href="https://colab.research.google.com/github/harshita547/Python/blob/main/csv_operations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [15]:
# Load the customer records from the CSV file
file_path = "/content/customers-1000.csv"
df = pd.read_csv(file_path)

# DataFrame.info() writes its summary to a stream and returns None, so
# `info = df.info()` would leave `info` bound to None. Capture the text in a
# buffer instead so `info` really holds the summary, then print it.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
info = info_buffer.getvalue()
print(info)

# Keep the first rows in `head` and make them the cell's displayed result
head = df.head()
head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Index              1000 non-null   int64 
 1   Customer Id        1000 non-null   object
 2   First Name         1000 non-null   object
 3   Last Name          1000 non-null   object
 4   Company            1000 non-null   object
 5   City               1000 non-null   object
 6   Country            1000 non-null   object
 7   Phone 1            1000 non-null   object
 8   Phone 2            1000 non-null   object
 9   Email              1000 non-null   object
 10  Subscription Date  1000 non-null   object
 11  Website            1000 non-null   object
dtypes: int64(1), object(11)
memory usage: 93.9+ KB


In [16]:
# Look for numeric columns that an average could be calculated on.
# For this file df.info() reports one int64 column ("Index") and eleven
# object columns, so "Index" is the only candidate at this point.

# Dtype of every column
column_types = df.dtypes

# "number" selects every numeric dtype rather than only int64/float64, so
# columns stored with another width (e.g. int32) are not overlooked
numeric_columns = df.select_dtypes(include="number").columns.tolist()

# Show the result so the outcome of the check is visible in the notebook
numeric_columns



# Subscription analysis and charts

In [17]:
# Convert Subscription Date to datetime type for analysis
import numpy as np

# errors='coerce' turns values that cannot be parsed into NaT instead of raising
parsed_subscription_dates = pd.to_datetime(df['Subscription Date'], errors='coerce')

# Report values that were present but failed to parse; otherwise they would
# silently disappear from the per-year counts below (value_counts skips NaN)
unparsed_dates = parsed_subscription_dates.isna() & df['Subscription Date'].notna()
if unparsed_dates.any():
    print(f"Warning: {int(unparsed_dates.sum())} subscription date(s) could not be parsed and were set to NaT")

df['Subscription Date'] = parsed_subscription_dates

# Extract year and month from subscription date
df['Subscription Year'] = df['Subscription Date'].dt.year
df['Subscription Month'] = df['Subscription Date'].dt.month

# Analysis 1: Customers subscribed by year, ordered chronologically
subscriptions_by_year = df['Subscription Year'].value_counts().sort_index()

# Analysis 2: Count of customers by country (top 10 for visualization)
subscriptions_by_country = df['Country'].value_counts().head(10)

# Analysis 3: Count number of unique companies
unique_companies = df['Company'].nunique()

# Display all three results as the cell output
subscriptions_by_year, subscriptions_by_country, unique_companies

(Subscription Year
 2020    426
 2021    404
 2022    170
 Name: count, dtype: int64,
 Country
 Liechtenstein          12
 Gabon                  10
 Bangladesh              9
 China                   9
 Reunion                 9
 Nigeria                 9
 Luxembourg              9
 Korea                   8
 Montserrat              8
 Antigua and Barbuda     8
 Name: count, dtype: int64,
 992)

In [18]:
import plotly.graph_objects as go
import plotly.io as pio


In [19]:

# Data: read the yearly totals from `subscriptions_by_year` (built in the
# analysis cell above) instead of re-typing them, so the chart always matches
# the loaded data. For this dataset that is 2020/2021/2022 -> 426/404/170.
years = [int(year) for year in subscriptions_by_year.index]
subscriptions = [int(count) for count in subscriptions_by_year]

# Create bar chart: one bar per subscription year
fig = go.Figure(data=go.Bar(
    x=years,
    y=subscriptions,
    marker_color='#1FB8CD'
))



In [20]:
# Title and axis labels for the yearly chart, gathered in one mapping and
# handed to update_layout as keyword arguments
yearly_chart_labels = {
    "title": "Customer Subscriptions by Year",
    "xaxis_title": "Year",
    "yaxis_title": "Subscriptions",
}
fig.update_layout(**yearly_chart_labels)


In [21]:

# Apply cliponaxis=False to every trace in the figure (turns off clipping of
# trace text at the subplot axes)
fig.update_traces(patch={"cliponaxis": False})



In [22]:
!pip install -U kaleido



In [23]:
import plotly.graph_objects as go
import plotly.io as pio

# Data: reuse the top-10 table computed earlier (`subscriptions_by_country`)
# so the chart shows exactly the countries and counts reported there, with
# their full names, instead of a separately typed list that can disagree.
countries = subscriptions_by_country.index.tolist()
counts = [int(count) for count in subscriptions_by_country]

# Create horizontal bar chart: one bar per country, length = customer count
fig = go.Figure(data=go.Bar(
    x=counts,
    y=countries,
    orientation='h',
    marker_color='#1FB8CD'
))

In [24]:

# Title and axis labels for the country chart, gathered in one mapping and
# handed to update_layout as keyword arguments
country_chart_labels = {
    "title": "Top 10 Countries by Customers",
    "xaxis_title": "Customers",
    "yaxis_title": "Country",
}
fig.update_layout(**country_chart_labels)

In [25]:
# Apply cliponaxis=False to every trace in the figure (turns off clipping of
# trace text at the subplot axes)
fig.update_traces(patch={"cliponaxis": False})



In [26]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# First try to load the raw data from the CSV file.
# NOTE: this rebinds `df`, so the 'Subscription Year' / 'Subscription Month'
# columns derived earlier in the notebook are not on the reloaded frame.
try:
    df = pd.read_csv("customers-1000.csv")
except Exception as e:
    # Best-effort: any load failure switches to the small sample frame below
    print(f"Could not load CSV: {e}")
    df = None
else:
    # Diagnostics run outside the try block so that a problem while printing
    # can never silently replace successfully loaded data with sample data.
    print("CSV columns:", df.columns.tolist())
    print("Shape:", df.shape)
    print("First few rows:")
    print(df.head())

    # Report which of the expected columns are already present
    if 'Subscription Year' in df.columns or 'subscription_year' in df.columns:
        print("Found subscription year column in CSV")
    if 'First Name' in df.columns or 'first_name' in df.columns:
        print("Found first name column in CSV")

# The sample data is used only when the CSV could not be loaded; a CSV that
# loads but lacks the columns checked above is kept as-is.
if df is None:
    print("Using provided JSON data")
    data = {
        "Subscription_Year": [2020, 2020, 2021, 2021, 2022, 2022, 2020, 2021, 2022],
        "First_Name_Length": [6, 5, 4, 7, 6, 5, 7, 4, 6]
    }
    df = pd.DataFrame(data)

print("Final dataframe shape:", df.shape)
print("Columns:", df.columns.tolist())

CSV columns: ['Index', 'Customer Id', 'First Name', 'Last Name', 'Company', 'City', 'Country', 'Phone 1', 'Phone 2', 'Email', 'Subscription Date', 'Website']
Shape: (1000, 12)
First few rows:
   Index      Customer Id First Name Last Name                      Company  \
0      1  dE014d010c7ab0c     Andrew   Goodman                Stewart-Flynn   
1      2  2B54172c8b65eC3      Alvin      Lane  Terry, Proctor and Lawrence   
2      3  d794Dd48988d2ac      Jenna   Harding                 Bailey Group   
3      4  3b3Aa4aCc68f3Be   Fernando      Ford                 Moss-Maxwell   
4      5  D60df62ad2ae41E       Kara     Woods              Mccarthy-Kelley   

               City           Country                Phone 1  \
0       Rowlandberg             Macao      846-790-4623x4715   
1          Bethside  Papua New Guinea     124-597-8652x05682   
2      Moniquemouth             China     (335)987-3085x3780   
3        Leeborough             Macao          (047)752-3122   
4  Port Jacks