## Data Exploration Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn
from ast import literal_eval
from joblib import Parallel, delayed
from collections import Counter


In [None]:
# Load the cleaned training data from CSV and display the resulting frame
df = pd.read_csv('drugs_clean_train.csv')
df

### Statistics

#### Average rating of each drug

In [None]:
# Mean rating per drug name, rounded to two decimals
rating = df[['drugName', 'rating']]

avg_rating = (
    rating
    .groupby('drugName')
    .agg('mean')
    .round(2)
)
avg_rating.head()


#### Statistical description

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-drug
# average ratings, rounded to two decimals
avg_rating.describe().round(2)

In [None]:
# Pull out the 'rating' column so a scalar is printed; formatting the result of
# DataFrame.median()/mode() directly embeds a multi-line repr in the message.
print(f"median: {avg_rating['rating'].median()}")
# mode() can return several values on a tie, so show them as a plain list
print(f"common rating: {avg_rating['rating'].mode().tolist()}")

- average rating: 7.40
- median rating: 7.75
- most common rating (mode): 10.00

#### Drugs with unsatisfied ratings

In [None]:
# Keep the drugs whose average rating is below 5
unsatisfied = avg_rating.loc[avg_rating['rating'].lt(5)]
unsatisfied.head()

#### Drugs with satisfied ratings

In [None]:
# Use >= so a drug averaging exactly 5 is counted as satisfied; with a strict >
# it would fall into neither the satisfied nor the unsatisfied (< 5) group and
# the two counts would not add up to the number of drugs.
satisfied = avg_rating[avg_rating['rating'] >= 5]
satisfied.head()

In [None]:
# len() gives the number of drugs as a plain integer; DataFrame.count() returns
# a Series, whose multi-line repr would otherwise be embedded in the message.
print(f'satisfied: {len(satisfied)}')
print(f'unsatisfied: {len(unsatisfied)}')

The data is very imbalanced: drugs with an unsatisfied average rating (below 5) make up only 11% of the drugs, while drugs with a satisfied average rating make up 89%.

#### Relationship between words and ratings

In [None]:
# Keep only the 'stemmed' and 'rating' columns and display them side by side
keywords_rating = df[['stemmed','rating']]
keywords_rating

#### Avg length of reviews

In [None]:
# Word count per review: split on whitespace, then take the length of each
# token list. .str.len() propagates a missing review as NaN, whereas
# apply(len) raises TypeError on it.
review = df['review'].str.split()
df['review_wordcount'] = review.str.len()
df.head()

In [None]:
# Mean number of words per review, rounded to two decimals
avg_length = df['review_wordcount'].mean().round(2)
avg_length

Average length of reviews is 84.7 words.

#### Stats of usefulCount score

In [None]:
# Summary statistics of usefulCount, rounded to two decimals.
# Despite its name, avg_useful holds the whole describe() table, not just the mean.
avg_useful = df['usefulCount'].describe().round(2)
avg_useful

Average number of users who found a review useful is 28.

##### Find words commonly associated with certain rating values

In [None]:
# Select the rating and stemmed columns. The sorted frame below is only displayed:
# sort_values returns a new frame, so `groups` itself keeps its original order.
groups = df[['rating','stemmed']]
groups.sort_values('rating')

Combine all stemmed words into one row for each rating value

In [None]:
# Make sure every 'stemmed' entry is a real list: values stored as strings are
# parsed with literal_eval. assign() builds a new frame instead of writing into a
# column slice of df, which avoids pandas' SettingWithCopyWarning.
groups = groups.assign(
    stemmed=groups['stemmed'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)
)

# Concatenate the word lists of all reviews that share a rating into one list.
# Flattening in a single pass is linear in the number of words; summing lists
# builds a new, ever longer list for every review. Non-list entries (e.g. NaN)
# are skipped.
drugs_keywords = (
    groups.groupby('rating')['stemmed']
    .agg(lambda word_lists: [
        word
        for words in word_lists
        if isinstance(words, (list, tuple))
        for word in words
    ])
    .reset_index()
)
drugs_keywords.columns = ['rating', 'words']

In [None]:
# Preview the first rows of the per-rating word table
drugs_keywords.head()

In [None]:
# Save the per-rating word table; index=False leaves the row index out of the file
drugs_keywords.to_csv('keywords.csv', index=False)

In [None]:
# Read the saved word table back from disk and display it
ddf = pd.read_csv('keywords.csv')
ddf

Extract the top 250 words from each rating's word list and store them in a new column

In [None]:
def top_250(words, n=250):
    """Return the most frequent items in ``words``, most common first.

    Parameters
    ----------
    words : list or str
        The items to count. A string is first parsed into a Python object
        with ``ast.literal_eval``.
    n : int, default 250
        Maximum number of distinct items to return.

    Returns
    -------
    list
        Up to ``n`` distinct items, ordered by descending count.
    """
    if isinstance(words, str):
        words = literal_eval(words)  # make sure the object is treated as a list
    counts = Counter(words)
    return [word for word, _ in counts.most_common(n)]

# Run top_250 over every row's word list on all available cores with joblib,
# then attach the results to the frame as a new column
run_parallel = Parallel(n_jobs=-1)
top_250_words = run_parallel(delayed(top_250)(word_list) for word_list in ddf['words'])

ddf['top_250_words'] = top_250_words

In [None]:
# Show ddf without a 'top_200_words' column, should one exist. errors='ignore'
# keeps the cell from raising KeyError when the column is absent (the cell above
# only creates 'top_250_words'). The result is displayed, not assigned back, so
# ddf itself is unchanged.
ddf.drop(columns=['top_200_words'], errors='ignore')

In [None]:
# One row per (rating, word) pair, then a word x rating table of occurrence counts
ddf_explode = ddf.explode('top_250_words')
frequency_table = pd.crosstab(index=ddf_explode['top_250_words'], columns=ddf_explode['rating'])

# Start from an empty frame with one column per rating value 1..10
columns = {rating: [] for rating in range(1,11)}
top_250_df = pd.DataFrame(columns)

for rating in range(1,11):
    if rating in frequency_table.columns:
        # Named top_words rather than top_250 so the top_250() function defined
        # earlier is not overwritten (re-running that cell would otherwise fail).
        # NOTE(review): if each rating has a single row of distinct words, the
        # crosstab holds only 0/1 flags, so nlargest breaks ties by index order
        # rather than by word frequency - confirm this ordering is intended.
        top_words = frequency_table.nlargest(250,rating).index.tolist()
        top_250_df[rating] = pd.Series(top_words)

top_250_df.head()

In [None]:
# Save the per-rating top-word table; with the default settings to_csv also
# writes the row index as the first column
top_250_df.to_csv('top_250.csv')

#### Oversample the smaller classes to balance the data

In [None]:
# Reload the training data from disk; this replaces the df that had the
# review_wordcount column added earlier in the notebook
df = pd.read_csv('drugs_clean_train.csv')

In [None]:
# Number of rows per value of the 'class' column, and the size of the largest
# class, which is the target size every class is resampled to
class_counts = df['class'].value_counts()
max_size = class_counts.max()

In [None]:
# Resample every class (with replacement, fixed seed) up to the size of the
# largest class. The groups are collected first and concatenated once: calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
# NOTE(review): the largest class is also drawn with replacement, so some of its
# original rows are dropped and others duplicated - confirm this is intended.
oversampled_groups = [
    group.sample(n=max_size, replace=True, random_state=42)
    for _, group in df.groupby('class')
]

# pd.concat raises on an empty list, so keep the empty-frame result for empty input
balanced_df = pd.concat(oversampled_groups, axis=0) if oversampled_groups else pd.DataFrame()

In [None]:
# Shuffle all rows (frac=1 draws every row once, in random order) with a fixed
# seed, then renumber the index from 0 and discard the old index
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Preview the first rows of the balanced, shuffled frame
balanced_df.head()

All classes are now equally represented in the `class` column.

In [None]:
# Save the balanced data. With the default settings to_csv also writes the
# (freshly reset) row index as an unnamed first column.
# NOTE(review): keywords.csv is written with index=False - confirm whether the
# index column is wanted in this file.
balanced_df.to_csv('balanced_df.csv')