# 1. Importing Libraries
First, import the necessary libraries:

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer


# 2. Loading Data
Load the Breast Cancer Wisconsin (Diagnostic) dataset using scikit-learn:

In [None]:
# Load the breast cancer dataset through scikit-learn's load_breast_cancer().
# The returned object exposes the attributes used later in this notebook:
# data.data (feature values), data.feature_names (column labels) and
# data.target (class labels).
data = load_breast_cancer()


# 3. Creating a DataFrame

Convert the loaded data into a pandas DataFrame:

In [None]:
# Wrap the raw feature values in a DataFrame, labelling each column with the
# matching feature name supplied by the dataset.
X = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
)


# 4. Exploring the DataFrame

Explore basic information and statistics from the DataFrame:

In [None]:
# Preview the top of the DataFrame.
# head(n) returns the first n rows; n=5 is also the default that applies
# when no number is passed.
print(X.head(n=5))

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0           

In [None]:
# Get a summary of statistics for the columns of X.
# Rows of the describe() output:
#   count: Total number of non-null values in the column.
#   mean:  Average value of the column.
#   std:   Standard deviation, showing how spread out the values are.
#   min:   Minimum value in the column.
#   25%:   25th percentile (Q1).
#   50%:   Median value (50th percentile).
#   75%:   75th percentile (Q3).
#   max:   Maximum value in the column.

print(X.describe())

       mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000000             0.000000   
25%      

In [None]:
# Get info about the DataFrame: index range, column names, non-null counts
# and dtypes.
# DataFrame.info() writes this summary to standard output itself and returns
# None, so it is called directly; wrapping it in print() would add a stray
# "None" line after the summary.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

# 5. Creating a Series

Create a pandas Series for the target variable:

In [None]:
# Create a Series for the target variable.
# pd.Series(): This command creates a pandas Series,
# which is a one-dimensional array-like object capable of holding data of any
# type (integer, string, float, Python objects, etc.).
# The Series has an index, making it very flexible for data manipulation.

# data.target: In the context of scikit-learn datasets,
# such as the Breast Cancer Wisconsin (Diagnostic) dataset,
# data.target contains the dependent variable or labels associated with each
# observation in the data set.
# These labels are numerical and represent classes. In scikit-learn's
# encoding of this dataset, 0 = malignant and 1 = benign; data.target_names
# lists the class names in label order, so check it rather than assuming
# which number means which class.

y = pd.Series(data.target)


# 6. Exploring the Series

Perform some basic exploration on the Series:

In [None]:
# Display the first few elements of the Series
print(y.head())

# By default, head() displays the first five elements,
# but you can pass a number as an argument to specify a different number of elements.
# In the case of y, which contains the target labels from the Breast Cancer dataset
# (in scikit-learn's encoding 0 represents malignant tumors and 1 benign tumors;
# see data.target_names),
# y.head() gives you a quick snapshot of the first five label values.


0    0
1    0
2    0
3    0
4    0
dtype: int64


In [None]:
# Count unique values in the Series.
# value_counts() tallies how many times each distinct value occurs.
# It is extremely useful for categorical data to understand the distribution
# of classes within the data.
# For the Series y, it counts how many instances there are of each class
# (malignant and benign in this context).
# This provides an insight into the balance of the dataset, which is crucial
# for machine learning models, as imbalanced data can lead to biased
# predictions.
# In scikit-learn's encoding of this dataset, 0 = malignant and 1 = benign
# (see data.target_names), so an output of 1: 357, 0: 212 means there are
# 357 benign cases and 212 malignant cases.
# The result is printed explicitly: a bare y.value_counts() expression that
# is not the last line of a cell is evaluated and then silently discarded.
print(y.value_counts())

1    357
0    212
Name: count, dtype: int64
