In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory,
# printing one full path per line.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ****Data Info

In [None]:
# Load the medicine dataset from the Kaggle input directory into `df`,
# the frame every later cell works on.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/indian-medicine-data/medicine_data.csv',
)

In [None]:
# Preview the first 10 rows of the loaded data.
df.head(10)

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

In [None]:
# Print a per-column overview of the frame (non-null counts and dtypes).
df.info()

In [None]:
# Summary statistics for the frame.
# NOTE(review): 'product_price' is only converted with pd.to_numeric in a later
# cell, so it may not be treated as a numeric column here — TODO confirm.
df.describe()

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Data type of each column.
df.dtypes

# ****Data Manipulation

In [None]:

# Convert 'product_price' to a numeric dtype.
#
# A plain pd.to_numeric(..., errors='coerce') turns every value it cannot parse
# (for example text carrying a currency symbol or thousands separators) into
# NaN without any signal, which would silently empty the price-based charts.
# Values that already parse are kept exactly as before; only the failures get
# a second, cleaned-up attempt.
raw_price = df['product_price']
numeric_price = pd.to_numeric(raw_price, errors='coerce')

# Entries that were present in the raw data but did not survive the conversion.
needs_cleaning = numeric_price.isna() & raw_price.notna()
if needs_cleaning.any():
    # Drop thousands separators, then keep the first number found in the text.
    first_number = (
        raw_price[needs_cleaning]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.extract(r'(-?\d+(?:\.\d+)?)', expand=False)
    )
    # Assign positionally so the result does not depend on index alignment.
    numeric_price.loc[needs_cleaning] = pd.to_numeric(first_number, errors='coerce').to_numpy()

df['product_price'] = numeric_price

# Surface anything that is still unparseable instead of hiding it as NaN.
still_unparsed = int((numeric_price.isna() & raw_price.notna()).sum())
if still_unparsed:
    print(f"{still_unparsed} 'product_price' values could not be parsed and were set to NaN")

# Check the data type again
print(df['product_price'].dtype)


In [None]:
# Replace missing descriptions with an empty string.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the column selection: that form is a chained assignment, is deprecated in
# pandas 2.x, and leaves `df` unchanged when Copy-on-Write is enabled.
df['medicine_desc'] = df['medicine_desc'].fillna('')

In [None]:
import string

# Remove punctuation and convert to lowercase.
# The translation table is built once instead of once per row, and the
# vectorized .str methods replace the per-row lambda. Missing values are filled
# here as well so this cell cannot fail on NaN entries.
punctuation_table = str.maketrans('', '', string.punctuation)
df['medicine_desc'] = (
    df['medicine_desc']
    .fillna('')
    .str.translate(punctuation_table)
    .str.lower()
)

# ****Exploratory Data Analysis

# Word cloud for 'medicine_desc' to visualize common words in descriptions

In [None]:
from wordcloud import WordCloud

# Build one corpus string from all descriptions. Missing values are dropped and
# entries are coerced to str so the join cannot fail on NaN/non-string cells.
text = " ".join(df['medicine_desc'].dropna().astype(str))

# random_state fixes the seed used for the word layout so the figure is
# reproducible from one run to the next.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=42,
).generate(text)

# Render the generated image without axes.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Medicine Descriptions')
plt.show()

# 'product_price' vs. 'sub_category'

In [None]:
# Scatter of product price per sub-category; the x categories are arranged
# with plotly's 'total ascending' ordering.
scatter_fig = px.scatter(
    df,
    x='sub_category',
    y='product_price',
    title='Product Price vs. Sub Category',
)
scatter_fig.update_xaxes(categoryorder='total ascending', title_text='Sub Category')
scatter_fig.update_yaxes(title_text='Product Price')
scatter_fig.show()

# 'product_price' by 'product_manufactured'

In [None]:
# Box plot of the price distribution for each manufacturer.
box_fig = (
    px.box(
        df,
        x='product_manufactured',
        y='product_price',
        title='Product Price by Manufacturer',
    )
    .update_xaxes(title_text='Manufacturer')
    .update_yaxes(title_text='Product Price')
)
box_fig.show()

# Interactive histogram of 'product_price'

In [None]:
# Histogram of product prices (30 bins requested).
hist_fig = (
    px.histogram(
        df,
        x='product_price',
        nbins=30,
        title='Distribution of Product Price',
    )
    .update_xaxes(title_text='Product Price')
    .update_yaxes(title_text='Frequency')
)
hist_fig.show()

# 'sub_category' Distribution

In [None]:
# Share of products per sub-category, drawn as a pie chart.
sub_category_counts = df['sub_category'].value_counts()
pie_fig = go.Figure(
    data=[
        go.Pie(
            labels=sub_category_counts.index,
            values=sub_category_counts.values,
            textinfo='percent+label',
            # Pull slices for emphasis: the list has three offsets, so only
            # three slices are pulled out of the pie.
            pull=[0.1, 0.1, 0.1],
        )
    ]
)
pie_fig.update_layout(title='Sub Category Distribution')
pie_fig.show()


# count of unique values in 'product_manufactured'

In [None]:
# Bar chart of how many rows (products) each manufacturer has.
manufactured_counts = df['product_manufactured'].value_counts()
bar_fig = (
    px.bar(
        x=manufactured_counts.index,
        y=manufactured_counts.values,
        title='Count of Products Manufactured',
    )
    .update_xaxes(title_text='Manufacturer')
    .update_yaxes(title_text='Count')
)
bar_fig.show()

# 'product_price' by 'sub_category'

In [None]:
# Box plot of the price distribution for each sub-category; the x categories
# are arranged with plotly's 'total ascending' ordering.
box_fig = px.box(
    df,
    x='sub_category',
    y='product_price',
    title='Product Price by Sub Category',
)
box_fig.update_xaxes(categoryorder='total ascending', title_text='Sub Category')
box_fig.update_yaxes(title_text='Product Price')
box_fig.show()

# Interactive bar chart for 'side_effects'

In [None]:
# Bar chart of the 20 most frequent values of the 'side_effects' column.
# NOTE(review): value_counts() counts each distinct cell value as-is; if a cell
# lists several side effects together, that combination is counted as a single
# entry rather than per individual effect — TODO confirm against the data.
side_effects_counts = df['side_effects'].value_counts().iloc[:20]
bar_side_effects_fig = px.bar(
    x=side_effects_counts.index,
    y=side_effects_counts.values,
    title='Top 20 Side Effects',
)
bar_side_effects_fig.update_xaxes(title_text='Side Effects', categoryorder='total descending')
bar_side_effects_fig.update_yaxes(title_text='Count')
bar_side_effects_fig.show()

# 'sub_category' and 'product_manufactured'

In [None]:
# Two-level sunburst: manufacturers form the inner ring and their
# sub-categories the outer ring (order given by `path`).
sunburst_levels = ['product_manufactured', 'sub_category']
sunburst_fig = px.sunburst(
    df,
    path=sunburst_levels,
    title='Sunburst Chart of Product Manufacturer and Sub Category',
)
sunburst_fig.show()