In [2]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# The text file is split on a literal "||"; the separator is written as an escaped
# regex, which is why the pure-Python parser engine is requested.
Data_1 = pd.read_csv("training_text.csv", engine="python", sep=r"\|\|")
# The variants file is read with pandas' default settings.
Data_2 = pd.read_csv("training_variants.csv")

In [4]:
# Turn the current index into an ordinary column and expose it under the name 'ID'.
Data_1 = Data_1.reset_index().rename(columns={'index': 'ID'})

In [5]:
# The parsed header produced a single column literally named 'ID,Text'; call it 'Text'.
Data_1 = Data_1.rename(columns={'ID,Text': 'Text'})

In [6]:
# Preview the first rows of the text frame to confirm the 'ID' / 'Text' layout after the renames.
Data_1.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [7]:
# Preview the first rows of the variants frame.
Data_2.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [8]:
# Show the column labels of both frames. "\033[94m" ... "\033[0m" are ANSI escape codes
# that colour the label text (94 = bright blue foreground, 0 = reset).
print("\033[94mColumns in Data_1:\033[0m", Data_1.columns)
print("\033[94mColumns in Data_2:\033[0m", Data_2.columns)

[94mColumns in Data_1:[0m Index(['ID', 'Text'], dtype='object')
[94mColumns in Data_2:[0m Index(['ID', 'Gene', 'Variation', 'Class'], dtype='object')


In [9]:
# Show the row index of both frames, with the label wrapped in ANSI colour codes
# (94 = bright blue foreground, 0 = reset).
print("\033[94mIndex of Data_1:\033[0m", Data_1.index)
print("\033[94mIndex of Data_2:\033[0m", Data_2.index)

[94mIndex of Data_1:[0m RangeIndex(start=0, stop=3321, step=1)
[94mIndex of Data_2:[0m RangeIndex(start=0, stop=3321, step=1)


In [10]:
# Combine text and variant rows that share the same 'ID' (pandas' default inner join).
Merge_Data = Data_1.merge(Data_2, on='ID')

In [11]:
# Inspect the first ten merged rows.
Merge_Data.head(10)

Unnamed: 0,ID,Text,Gene,Variation,Class
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,2
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,3
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4
5,5,Oncogenic mutations in the monomeric Casitas B...,CBL,V391I,4
6,6,Oncogenic mutations in the monomeric Casitas B...,CBL,V430M,5
7,7,CBL is a negative regulator of activated recep...,CBL,Deletion,1
8,8,Abstract Juvenile myelomonocytic leukemia (JM...,CBL,Y371H,4
9,9,Abstract Juvenile myelomonocytic leukemia (JM...,CBL,C384R,4


In [12]:
# Count missing values per column.
Merge_Data.isnull().sum()

ID           0
Text         5
Gene         0
Variation    0
Class        0
dtype: int64

In [13]:
# Count rows that are exact duplicates of an earlier row.
Merge_Data.duplicated().sum()

0

In [14]:
# Discard every row that has at least one missing value. The surviving rows keep
# their original index labels, so the index is no longer contiguous afterwards.
Merge_Data = Merge_Data.dropna(axis=0)

In [15]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
Merge_Data.isnull().sum()

ID           0
Text         0
Gene         0
Variation    0
Class        0
dtype: int64

In [16]:
# Summarise index range, column dtypes, non-null counts and memory usage.
Merge_Data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3316 entries, 0 to 3320
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         3316 non-null   int64 
 1   Text       3316 non-null   object
 2   Gene       3316 non-null   object
 3   Variation  3316 non-null   object
 4   Class      3316 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 155.4+ KB


<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

<font color="red" size="+4"><strong>NLP</strong></font>

<font color="blue">

**Numerical:-->** **ID**, **Class**

**Textual:-->** **Text**

**Categorical:-->** **Gene**, **Variation**

</font>


<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

In [17]:
from sklearn import preprocessing

# One encoder per column: refitting a single LabelEncoder keeps only the classes_ of the
# last column it saw, which would make the Gene codes impossible to map back to gene names.
gene_encoder = preprocessing.LabelEncoder()
variation_encoder = preprocessing.LabelEncoder()

# Replace the string categories by integer codes (the produced codes are unchanged
# compared with using one shared encoder).
Merge_Data['Gene'] = gene_encoder.fit_transform(Merge_Data['Gene'])
Merge_Data['Variation'] = variation_encoder.fit_transform(Merge_Data['Variation'])

# Backward-compatible alias: the previously shared encoder ended up fitted on 'Variation'.
label_encoder = variation_encoder

In [18]:
# Frequency of each label-encoded Gene code, most frequent first.
Merge_Data["Gene"].value_counts()

Gene
30     264
250    163
65     141
194    126
31     125
      ... 
214      1
184      1
189      1
257      1
84       1
Name: count, Length: 262, dtype: int64

In [None]:
# Bar chart with one bar per class; bar height is the number of rows in that class.
class_count_ax = sns.countplot(data=Merge_Data, x='Class', color="limegreen")
class_count_ax.set_title('Distribution of Classes')
plt.show()

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

<font color="navy">

**1. Seaborn Countplot:**
   **- The code utilizes `seaborn` to create a countplot for the 'Class' variable in the DataFrame `Merge_Data`.**

**2. Class Distribution Visualization:**
   **- It visually represents the distribution of classes within the 'Class' variable.**

**3. Counting Occurrences:**
   **- The height of each bar in the countplot corresponds to the number of occurrences of each class.**

**4. Data Analysis:**
   **- Countplots are useful for analyzing the distribution and frequency of categorical variables.**

**5. Title Setting:**
   **- The title of the countplot is set as 'Distribution of Classes' using `plt.title`.**

**6. Insights into Class Frequency:**
   **- The plot provides insights into the frequency of each class within the dataset.**

**7. Visual Summary:**
   **- The code offers a visual summary of the distribution of classes.**

**8. Useful for Initial Data Exploration:**
   **- Countplots are valuable for initial data exploration and understanding the composition of categorical variables.**

**9. Crucial for Classification Tasks:**
   **- Understanding class distribution is crucial for tasks like classification, where balanced datasets are often preferred.**

**10. Plot Display:**
    **- The plot is displayed using `plt.show()`.**

</font>

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

In [None]:
# One HUSL colour per distinct class value.
custom_palette = sns.color_palette("husl", n_colors=Merge_Data['Class'].unique().size)

# Pairwise scatter plots (and per-variable distributions) of the three numeric columns,
# coloured by class.
pair_columns = ['Gene', 'Variation', 'Class']
sns.pairplot(
    Merge_Data[pair_columns],
    hue='Class',
    palette=custom_palette,
    markers='o',
)

# Figure-level title placed slightly above the grid (y > 1) so it does not overlap the panels.
plt.suptitle('Pairplot for Selected Columns', y=1.02, color="green")
plt.tight_layout(pad=2.0)
plt.show()

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

<font color="navy">

1. **Seaborn Pairplot:**
   **- The code utilizes `seaborn` to create a pairplot for selected columns ('Gene', 'Variation', 'Class') in the DataFrame `Merge_Data`.**

2. **Comprehensive Visualization:**
   **- A pairplot is a grid of scatterplots showing relationships between different pairs of variables.**

3. **Variables in Focus:**
   **- It visualizes the relationships and distributions between 'Gene', 'Variation', and 'Class'.**

4. **Data Point Representation:**
   **- Each point on the scatterplots represents a data point, enabling the observation of potential correlations.**

5. **Pairwise Interaction Analysis:**
   **- Pairplots are useful for understanding the pairwise interactions between variables.**

6. **Title Setting:**
   **- The title of the pairplot is set to 'Pairplot for Selected Columns' using `plt.suptitle`.**

7. **Cluster, Outlier, and Pattern Identification:**
   **- Insights into potential clusters, outliers, or patterns can be derived from the pairplot.**

8. **Visual Overview:**
   **- The code provides a visual overview of the relationships between the specified columns.**

9. **Crucial for Feature Analysis:**
   **- Understanding these relationships is crucial for feature analysis and model development.**

10. **Plot Display:**
    **- The plot is displayed using `plt.show()`.**

</font>

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

In [None]:
# Box outline styling. It has to be defined before the plot call and handed to
# sns.boxplot (which forwards it to matplotlib); previously the dict was created
# after plotting and never used, so it had no effect on the figure.
boxprops = dict(edgecolor='black', linewidth=2)

# Distribution of the (label-encoded) Gene code within each class.
sns.boxplot(x='Class', y='Gene', data=Merge_Data, color='orange', boxprops=boxprops)
plt.title('Boxplot of Gene by Class', color="Green")
plt.show()

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

<font color="navy">

1. **Seaborn Boxplot:**
   **- The provided code utilizes `seaborn` to create a boxplot, specifically focusing on the 'Gene' variable across different classes ('Class') within the DataFrame `Merge_Data`.**

2. **Visualization of Distribution:**
   **- A boxplot visually represents the distribution of the 'Gene' variable for each class, showcasing key statistics such as median, quartiles, and potential outliers.**

3. **Class Comparison:**
   **- Each box in the plot corresponds to a class, allowing for a quick comparison of the central tendency and variability of the 'Gene' values across classes.**

4. **Axes Representation:**
   **- The x-axis represents the different classes, and the y-axis displays the distribution of 'Gene' values.**

5. **Title Setting:**
   **- The title of the boxplot is set as 'Boxplot of Gene by Class' using the `plt.title` function.**

6. **Insights into Variability:**
   **- Insights into the spread and central tendency of 'Gene' across different classes can be gained from the boxplot.**

7. **Visual Summary:**
   **- The code provides a visual summary of how the 'Gene' variable varies with different classes.**

8. **Crucial Understanding:**
   **- Understanding these variations is crucial for feature analysis and identifying potential class-specific patterns.**

9. **Displaying the Plot:**
   **- The plot is displayed using `plt.show()`.**

</font>

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

In [None]:
# One HUSL colour per distinct class value.
custom_palette = sns.color_palette("husl", n_colors=Merge_Data['Class'].unique().size)

# Encoded Gene vs. encoded Variation, coloured by class; semi-transparent markers
# without outlines keep overlapping points readable.
scatter_ax = sns.scatterplot(
    data=Merge_Data,
    x='Gene',
    y='Variation',
    hue='Class',
    palette=custom_palette,
    alpha=0.7,
    edgecolor='none',
)
scatter_ax.set_title('Correlation between Gene and Variation')
plt.tight_layout()
plt.show()

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

<font color="navy">

1. **Seaborn Scatterplot:**
   **- The code utilizes `seaborn` to create a scatterplot, visualizing the correlation between the 'Gene' and 'Variation' variables in the DataFrame `Merge_Data`.**

2. **Custom Color Palette:**
   **- A custom color palette ('husl') is employed to enhance the visual appeal of the scatterplot. The number of unique classes determines the colors, ensuring clarity and distinction.**

3. **Insights into Correlation:**
   **- The scatterplot allows for the exploration of relationships between 'Gene' and 'Variation'. Each point represents an observation in the dataset, and the color indicates the respective class.**

4. **Transparency Setting:**
   **- An alpha value of 0.7 is applied to the points, introducing transparency and aiding in the visualization of overlapping data points.**

5. **No Edge Color:**
   **- The edges around the points are removed ('edgecolor' set to 'none'), providing a smoother appearance and focusing on the colored points.**

6. **Axes Representation:**
   **- The x-axis corresponds to the 'Gene' variable, while the y-axis represents the 'Variation' variable.**

7. **Title Setting:**
   **- The title of the scatterplot is specified as 'Correlation between Gene and Variation' using the `plt.title` function.**

8. **Layout Adjustment:**
   **- The layout is tightened for a more visually appealing presentation using `plt.tight_layout()`.**

9. **Insights into Relationships:**
   **- The scatterplot aids in identifying patterns, trends, or clusters in the correlation between 'Gene' and 'Variation', potentially offering valuable insights.**

10. **Displaying the Plot:**
    **- The final plot is displayed using `plt.show()`.**

</font>

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

In [None]:
# Violin plot of the (label-encoded) Gene code per class. hue='Class' with legend=False
# colours each violin from the palette without drawing a redundant legend.
plt.figure(figsize=(10, 6))
sns.violinplot(x='Class', y='Gene', data=Merge_Data, hue='Class', palette='viridis', legend=False)

# Title and axis labels so the figure stands on its own, and an explicit show() so the
# cell renders the figure instead of echoing the Axes repr.
plt.title('Violin Plot of Gene by Class')
plt.xlabel('Class')
plt.ylabel('Gene')
plt.show()

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

<font color="navy">

1. **Seaborn Violin Plot:**
   **- The code utilizes `seaborn` to create a violin plot, specifically focusing on the relationship between the 'Gene' variable and different classes ('Class') within the DataFrame `Merge_Data`.**

2. **Distribution Analysis:**
   **- Violin plots display the distribution of a numeric variable across different categories, offering insights into the central tendency and spread.**

3. **Classwise Gene Distribution:**
   **- Each violin plot in the figure corresponds to a different class, representing the distribution of 'Gene' values for that class.**

4. **Palette Setting:**
   **- The 'viridis' color palette is used for visualizing the violin plots.**

5. **Figure Size Adjustment:**
   **- The size of the figure is adjusted to (10, 6) using `plt.figure(figsize=(10, 6))`.**

6. **Title, X-axis, and Y-axis Labels:**
   **- The title is set as 'Violin Plot of Gene by Class' using `plt.title`.**
   **- The x-axis label is set as 'Class' using `plt.xlabel`.**
   **- The y-axis label is set as 'Gene' using `plt.ylabel`.**

7. **Insights into Gene Distribution:**
   **- Insights into how the distribution of 'Gene' values varies across different classes can be gained from the violin plot.**

8. **Visual Summary:**
   **- The code provides a visual summary of the relationship between 'Gene' and 'Class'.**

9. **Crucial for Feature Analysis:**
   **- Understanding these variations is crucial for feature analysis and identifying potential class-specific patterns.**

10. **Plot Display:**
    **- The plot is displayed using `plt.show()`.**

</font>

<span style="display: block; width: 100%; height: 4px; background-color: #3498db;"></span>

In [None]:
# Print the first document in full, followed by a separator line.
# Slicing (rather than indexing) keeps this a no-op when the frame is empty.
first_documents = Merge_Data['Text'].values[:1]
for document in first_documents:
    print(document)
    print('==================================\n')

In [None]:
import re
def decontracted(phrase):
    """Expand common English contractions and strip selected punctuation.

    Parameters
    ----------
    phrase : str
        Text to clean.

    Returns
    -------
    str
        ``phrase`` with contractions written out ("can't" -> "can not",
        "won't" -> "will not", "don't" -> "do not", "it's" -> "it is", ...)
        and every character/sequence listed in the removal loop deleted.
    """
    # Irregular forms first: the generic "n't" rule below would otherwise produce
    # "ca not" / "wo not". The capture group preserves the case of the first letter,
    # so "Won't" becomes "Will not".
    phrase = re.sub(r"([Cc])an\'t", r"\1an not", phrase)
    phrase = re.sub(r"([Ww])on\'t", r"\1ill not", phrase)

    # Generic suffix rules.
    # The negation rule matches "n't" (not a bare "'t") so that "don't" becomes
    # "do not" instead of "don not".
    phrase = re.sub(r"n\'t", r" not", phrase)
    phrase = re.sub(r"\'m", r" am", phrase)
    phrase = re.sub(r"\'d", r" would", phrase)
    phrase = re.sub(r"\'ll", r" will", phrase)
    phrase = re.sub(r"\'ve", r" have", phrase)
    # Note: this also rewrites possessives ("gene's" -> "gene is").
    phrase = re.sub(r"\'s", r" is", phrase)
    phrase = re.sub(r"\'re", r" are", phrase)

    # Specific removals: each entry is deleted outright (replaced by the empty string).
    # The list holds real control characters ('\r', '\n', '\t') as well as the
    # two-character literal sequences backslash+r / backslash+n / backslash+t.
    for token in ['/', '\\r', '\\n', '?', '\r', '\n', '\"', ':', '(', ')', '!', '\t', '\\t']:
        phrase = phrase.replace(token, "")
    return phrase

In [None]:
# Position of the document to preview; kept in one place instead of being repeated
# as a magic number in the condition, the lookup and the message.
SAMPLE_INDEX = 20000

# Check the size of the array
text_values = Merge_Data['Text'].values
array_size = len(text_values)
print("Array size:", array_size)

# Access the element only when it is inside the valid range. The value is read from
# the same array whose length was checked (previously the bound came from Merge_Data
# while the element was read from Data_1, which has a different number of rows).
if 0 <= SAMPLE_INDEX < array_size:
    sent = decontracted(text_values[SAMPLE_INDEX])
    print(sent)
else:
    print(f"Index {SAMPLE_INDEX} is out of bounds for the array.")

In [None]:
from nltk.corpus import stopwords
# English stop words held in a set so the membership tests in the cleaning loop are fast.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stop = set(stopwords.words('english'))

In [None]:
from tqdm import tqdm
import numpy as np  # Import numpy for handling NaN values

# Cleaned version of every document, in the same order as Merge_Data['Text'].
preprocessed_essay = []
for sentence in tqdm(Merge_Data['Text'].values):
    if not isinstance(sentence, str):
        # Non-string entries (e.g. NaN) become empty documents so that positions
        # stay aligned with the rows of Merge_Data.
        preprocessed_essay.append("")
        continue

    sent = decontracted(sentence)
    # Collapse every run of non-alphanumeric characters into a single space.
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # Lower-case BEFORE filtering: the stop-word comparison is case-sensitive, so
    # capitalised stop words ("The", "And") previously slipped through the filter.
    sent = sent.lower()
    # split()/join already removes leading, trailing and repeated whitespace.
    preprocessed_essay.append(' '.join(word for word in sent.split() if word not in stop))

In [None]:
# Keep a handle on the raw 'Text' column.
# NOTE(review): `Text` is not referenced again in the visible part of the notebook — confirm it is still needed.
Text = Merge_Data['Text']

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; 'Class' is the label to predict.
x = Merge_Data.drop('Class', axis=1)
y = Merge_Data['Class']

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
# The variable names are lower-case throughout (the previous mix of `x` and `X`, and of
# `x_train` and `X_train`, raised NameError).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print("Training set shape (x_train, y_train):", x_train.shape, y_train.shape)
print("Testing set shape (x_test, y_test):", x_test.shape, y_test.shape)

In [None]:
# Preview the feature frame (every column of Merge_Data except 'Class').
x.head()

In [None]:
# Display the target Series (the 'Class' column).
y

# Numerical
--------------------

<span style="color:Blue; font-size:20px; font-weight:bold;">Normalisation</span>

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the listed columns to the [0, 1] range and write the result back in place.
# NOTE(review): 'Class' is the column used as the prediction target `y` elsewhere in this
# notebook; scaling it turns the class labels into floats — confirm this is intended.
numeric_features = ["Class"]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(Merge_Data[numeric_features])
Merge_Data[numeric_features] = scaled_values

<span style="color:Blue; font-size:20px; font-weight:bold;">Standardisation</span>

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the listed columns (zero mean, unit variance) and write the result back in place.
# NOTE(review): 'Class' is the column used as the prediction target `y` elsewhere in this
# notebook; standardising it replaces the class labels by floats — confirm this is intended.
numeric_features = ["Class"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Merge_Data[numeric_features])
Merge_Data[numeric_features] = scaled_values

# Categorical
------------------------------

<span style="color:Blue; font-size:20px; font-weight:bold;">LabelEncoding</span>

In [None]:
from sklearn import preprocessing

# Integer-encode both categorical columns with a single encoder that is refit per column;
# after the loop the encoder holds the classes of the last column ('Variation').
label_encoder = preprocessing.LabelEncoder()
for categorical_column in ('Gene', 'Variation'):
    Merge_Data[categorical_column] = label_encoder.fit_transform(Merge_Data[categorical_column])

<span style="color:Blue; font-size:20px; font-weight:bold;">OneHotEncoding</span>

In [None]:
# One-hot encode 'Gene' and 'Variation' into indicator columns, leaving Merge_Data itself untouched.
# NOTE(review): both columns were already replaced by integer codes in earlier cells, so the
# indicator columns are named after those codes; `Merge_Data_encoded` is not used again in the
# visible part of the notebook — confirm.
Merge_Data_encoded = pd.get_dummies(Merge_Data, columns=['Gene', 'Variation'])

# Text features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Normalise the raw text: lower-case it, then delete every character that is neither a
# word character nor whitespace. The pattern is a raw string so that \w and \s reach the
# regex engine unchanged instead of being treated as (invalid) string escapes.
Merge_Data['Text'] = Merge_Data['Text'].str.lower()
Merge_Data['Text'] = Merge_Data['Text'].replace(r'[^\w\s]', '', regex=True)

# TF-IDF over the 5000 highest-ranked terms, ignoring scikit-learn's English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
text_features = tfidf_vectorizer.fit_transform(Merge_Data['Text'].astype('U'))

# Give the feature frame the SAME index as Merge_Data. After the earlier dropna() the
# index of Merge_Data has gaps, whereas a fresh DataFrame would get 0..n-1; concat along
# axis=1 aligns on index labels, so a mismatch pairs features with the wrong rows and
# introduces NaN-filled rows.
text_df = pd.DataFrame(
    text_features.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=Merge_Data.index,
)

# Append the TF-IDF columns and drop the raw text they were derived from.
Merge_Data = pd.concat([Merge_Data, text_df], axis=1)
Merge_Data = Merge_Data.drop('Text', axis=1)

print(Merge_Data.head())