In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Root folder of the mounted Kaggle dataset (read-only input directory).
directory = '/kaggle/input/customer-support-on-twitter'
# os.listdir() returns entries in arbitrary order; sorting keeps the printed
# listing identical from one run to the next.
files = sorted(os.listdir(directory))
print(files)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Build the CSV path from `directory` (defined in the first cell) so the
# dataset location is written down in one place only.
data = pd.read_csv(f'{directory}/sample.csv')


In [None]:
# Preview the first five rows of the loaded dataframe
data.head(n=5)

> **Mise en minuscules**

In [None]:
# Store a lower-cased copy of every tweet in a new 'text_lower' column;
# the original 'text' column is left untouched.
data["text_lower"] = data["text"].str.lower()

# Preview the first five rows, now including 'text_lower'
data.head(n=5)


**Suppression des ponctuations**

In [None]:
import string

# Characters to strip: the ASCII punctuation marks listed by the string module
punc_to_remove = string.punctuation

# Translation table that deletes every character of punc_to_remove. It is
# built once here rather than inside remove_punc, because the function is
# called once per row and the table never changes between calls.
_punc_table = str.maketrans('', '', punc_to_remove)


def remove_punc(text):
    """Return ``text`` with every character of ``punc_to_remove`` deleted.

    Args:
        text (str): The string to clean.

    Returns:
        str: ``text`` without the punctuation characters. All other
        characters, including whitespace, are kept unchanged.
    """
    return text.translate(_punc_table)

# Strip punctuation from the lower-cased tweets ('text_lower') rather than from
# the raw 'text' column. The stop-word and word-frequency steps further down
# are built on 'punc_remove' and compare words case-sensitively, so feeding
# them mixed-case text would let capitalised stop words such as "The" through
# the filter and count "Please" and "please" as two different words.
data['punc_remove'] = data['text_lower'].apply(remove_punc)

# Display the first few rows of the dataframe
data.head()


**Suppression des mots vides**

In [None]:
from nltk.corpus import stopwords

# English stop words from NLTK, held in a set for fast membership tests
stopwords_set = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stopwords_set``.

    The comparison is exact (case-sensitive). The surviving tokens are
    returned joined by single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stopwords_set)

# Filter the stop words out of the punctuation-free text ('punc_remove');
# the result is kept in a new 'stopwords_remove' column.
data['stopwords_remove'] = data['punc_remove'].apply(remove_stopwords)

# Preview the first five rows
data.head()


**Suppression des mots fréquents**

In [None]:
from collections import Counter

# Tally every whitespace-separated token found in the 'stopwords_remove' column
word_counter = Counter(
    token
    for cleaned_text in data['stopwords_remove'].values
    for token in cleaned_text.split()
)

# The ten highest counts, as (word, count) pairs
most_common_words = word_counter.most_common(10)

print(most_common_words)


**Suppression des mots rares**

In [None]:
from collections import Counter

# Count how often each token occurs across the 'stopwords_remove' column
word_counter = Counter(
    token
    for cleaned_text in data['stopwords_remove'].values
    for token in cleaned_text.split()
)

# Minimum number of occurrences a word needs in order to be kept (adjust as needed)
frequency_threshold = 5

# Vocabulary of words that reach the threshold; every other word counts as rare
common_words = {
    token
    for token, occurrences in word_counter.items()
    if occurrences >= frequency_threshold
}

def remove_rare_words(text):
    """Keep only the tokens of ``text`` that appear in ``common_words``.

    ``text`` is split on whitespace; tokens missing from ``common_words`` are
    dropped and the remaining ones are joined back together with single spaces.
    """
    return ' '.join(token for token in text.split() if token in common_words)

# Drop the rare words from the stop-word-free text ('stopwords_remove');
# the result is kept in a new 'common_words_remove' column.
data['common_words_remove'] = data['stopwords_remove'].apply(remove_rare_words)

# Preview the first five rows
data.head()


**Stemming**

In [None]:
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance used by stem_words
stemmer = PorterStemmer()


def stem_words(text):
    """Stem every whitespace-separated token of ``text`` with ``stemmer``.

    Returns the stemmed tokens joined by single spaces.
    """
    stemmed_tokens = map(stemmer.stem, text.split())
    return ' '.join(stemmed_tokens)

# Stem the tweets of the 'text' column and keep the result in a new
# 'stemmer_text' column.
data['stemmer_text'] = data['text'].apply(stem_words)

# Preview the first two rows
data.head(n=2)


**Lemmatisation**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus through the NLTK downloader
nltk.download('wordnet')

# Shared lemmatizer instance used by lemmatize_words
lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize every whitespace-separated token of ``text`` with ``lemmatizer``.

    Returns the lemmatized tokens joined by single spaces.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in text.split())
    return ' '.join(lemmas)



**Suppression des émojis, suppression des émoticônes**

In [66]:
import re

# Compiled once at definition time instead of on every call.
# Every entry is limited to a block of symbols. A single wide range such as
# U+24C2-U+1F251 must not be used here: it spans most of the Basic Multilingual
# Plane (CJK ideographs, kana, Hangul, ...) and would delete ordinary
# non-Latin text together with the emojis. The list is not exhaustive.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with emoji characters deleted.

    Args:
        string (str): The text to clean. The parameter name shadows the
            standard-library ``string`` module inside this function only;
            it is kept so existing keyword calls continue to work.

    Returns:
        str: ``string`` without the characters matched by ``_EMOJI_PATTERN``.
        Letters of any script, digits, punctuation and whitespace are kept.
    """
    return _EMOJI_PATTERN.sub('', string)


print(remove_emoji("GI4 is on 🔥🔥"))


GI4 is on 


**Suppression des URL, Suppression des balises**

In [None]:
import re

def remove_urls(text):
    """Return ``text`` with every URL-like substring deleted.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character, so punctuation glued to the
    end of a link is removed together with it.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. A span only matches when the opening ``<`` and
    the closing ``>`` are on the same line.
    """
    return re.sub(r'<.*?>', '', text)

# Demo: strip the URL first, then the HTML tags, and print what is left
text_with_urls_html = "Check out my website at https://example.com! <p>This is a paragraph.</p>"
text_without_urls = remove_urls(text_with_urls_html)
text_without_urls_html = remove_html_tags(text_without_urls)
print(text_without_urls_html)
