In [14]:
import re
import pandas as pd

def parse_log_file(file_path):
    with open(file_path, 'r') as file:
        logs = file.readlines()

    data = []
    log_entry = ""
    timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}\+\d{2}:\d{2}$')

    for line in logs:
        if timestamp_pattern.match(line.strip()):
            if log_entry:
                data.append(log_entry.strip())
            log_entry = line.strip() + " "
        else:
            log_entry += line.strip() + " "
    
    if log_entry:
        data.append(log_entry.strip())

    return pd.DataFrame(data, columns=['log_message'])
file_path = "C:\\Users\\AZA\\Desktop\\PFA\\DATA\\alert_FRET.log"

# Parsing the log file
df_logs = parse_log_file(file_path)

# Displaying the first few parsed log entries
print(df_logs.head(10))


                                         log_message
0  2023-10-31T00:22:56.097799+01:00 Starting ORAC...
1  2023-10-31T00:22:56.123085+01:00 *************...
2  2023-10-31T00:22:56.124890+01:00 Dump of syste...
3  2023-10-31T00:22:56.127019+01:00 Available sys...
4  2023-10-31T00:22:56.129951+01:00 Supported sys...
5  2023-10-31T00:22:56.131802+01:00 PAGESIZE  AVA...
6  2023-10-31T00:22:56.133304+01:00 4K       Conf...
7  2023-10-31T00:22:56.134781+01:00 Reason for no...
8  2023-10-31T00:22:56.136356+01:00 64K - Default...
9  2023-10-31T00:22:56.138132+01:00 16M - Default...


In [12]:
# Sélectionner un échantillon de 200 logs pour l'étiquetage manuel

sample_logs = df_logs.head(200).reset_index(drop=True)

# Display the first 10 rows of the selected sample



TypeError: 'DataFrame' object is not callable

In [4]:
# Définir des catégories (labels) pour les logs avec gravité
categories = {
    'information': 0,
    'warning': 1,
    'error': 2,
    'critical': 3
}


In [5]:
def manual_labeling_with_severity(logs):
    labels = []
    print("Veuillez étiqueter les logs avec les catégories suivantes et leur gravité correspondante:")
    for category, severity in categories.items():
        print(f"{category}: {severity}")
    
    for index, log in logs.iterrows():
        print(f"\nLog {index + 1}:")
        print(log['log_message'])
        label = input("Entrez la catégorie (information, warning, error, critical): ")
        while label not in categories:
            print("Catégorie invalide. Veuillez entrer une catégorie valide.")
            label = input("Entrez la catégorie (information, warning, error, critical): ")
        labels.append(categories[label])
    
    return labels

# Étiqueter manuellement les logs
sample_labels = manual_labeling_with_severity(sample_logs)
sample_logs['severity'] = sample_labels


Veuillez étiqueter les logs avec les catégories suivantes et leur gravité correspondante:
information: 0
error: 2
critical: 3

Log 1:
2023-10-31T00:22:56.097799+01:00 Starting ORACLE instance (normal) (OS id: 2359880)

Log 2:
2023-10-31T00:22:56.123085+01:00 **********************************************************************
Catégorie invalide. Veuillez entrer une catégorie valide.
Catégorie invalide. Veuillez entrer une catégorie valide.

Log 3:
2023-10-31T00:22:56.124890+01:00 Dump of system resources acquired for SHARED GLOBAL AREA (SGA)

Log 4:
2023-10-31T00:22:56.127019+01:00 Available system pagesizes: 4K, 64K, 16M, 16G

Log 5:
2023-10-31T00:22:56.129951+01:00 Supported system pagesize(s):

Log 6:
2023-10-31T00:22:56.131802+01:00 PAGESIZE  AVAILABLE_PAGES  EXPECTED_PAGES  ALLOCATED_PAGES  ERROR(s)

Log 7:
2023-10-31T00:22:56.133304+01:00 4K       Configured         6291472         6291472        NONE

Log 8:
2023-10-31T00:22:56.134781+01:00 Reason for not supporting certain sy

In [6]:
sample_logs.to_csv("sample_logs_labeled.csv", index=False)
print("Labeled sample logs have been saved to 'sample_logs_labeled.csv'.")

Labeled sample logs have been saved to 'sample_logs_labeled.csv'.


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load the manually labeled sample
labeled_sample_path = "sample_logs_labeled.csv"
labeled_sample = pd.read_csv(labeled_sample_path)

# Split the data into features and labels
X = labeled_sample['log_message']
y = labeled_sample['severity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97        93
           1       0.00      0.00      0.00         1
           2       1.00      0.17      0.29         6

    accuracy                           0.94       100
   macro avg       0.65      0.39      0.42       100
weighted avg       0.93      0.94      0.92       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Convert all log messages to TF-IDF features
all_logs_tfidf = vectorizer.transform(df_logs['log_message'])

# Predict the labels for all logs
all_predicted_labels = model.predict(all_logs_tfidf)

# Add the predicted labels to the DataFrame
df_logs['predicted_severity'] = all_predicted_labels

# Save the labeled logs to a CSV file
#df_logs.to_csv("all_logs_labeled.csv", index=False)
#print("All logs have been labeled and saved to 'all_logs_labeled.csv'.")


In [17]:
new_sample_logs = df_logs.iloc[500:1000].reset_index(drop=True)
new_sample_logs.head(10)


Unnamed: 0,log_message
0,2023-10-31T07:50:25.332785+01:00 ARC0 (PID:334...
1,2023-10-31T07:50:25.334567+01:00 In-memory ope...
2,2023-10-31T07:50:25.506527+01:00 Archiving pre...
3,2023-10-31T07:50:26.277712+01:00 TMON (PID:439...
4,2023-10-31T07:50:26.316893+01:00 ARC1 started ...
5,2023-10-31T07:50:26.362546+01:00 ARC2 started ...
6,2023-10-31T07:50:26.407257+01:00 ARC3 started ...
7,2023-10-31T07:50:26.450213+01:00 ARC4 started ...
8,2023-10-31T07:50:26.488669+01:00 ARC5 started ...
9,2023-10-31T07:50:26.529093+01:00 ARC6 started ...


In [18]:
def manual_labeling_with_severity(logs):
    labels = []
    print("Veuillez étiqueter les logs avec les catégories suivantes et leur gravité correspondante:")
    for category, severity in categories.items():
        print(f"{category}: {severity}")
    
    for index, log in logs.iterrows():
        print(f"\nLog {index + 1}:")
        print(log['log_message'])
        label = input("Entrez la catégorie (information, warning, error, critical): ")
        while label not in categories:
            print("Catégorie invalide. Veuillez entrer une catégorie valide.")
            label = input("Entrez la catégorie (information, warning, error, critical): ")
        labels.append(categories[label])
    
    return labels

# Étiqueter manuellement les logs
sample_labels = manual_labeling_with_severity(new_sample_logs)
new_sample_logs['severity'] = sample_labels

Veuillez étiqueter les logs avec les catégories suivantes et leur gravité correspondante:
information: 0
error: 2
critical: 3

Log 1:
2023-10-31T07:50:25.332785+01:00 ARC0 (PID:3342926): Becoming a 'no FAL' ARCH

Log 2:
2023-10-31T07:50:25.334567+01:00 In-memory operation on ADG is currently only supported on Engineered systems and PaaS. inmemory_adg_enabled is turned off automatically. Please contact our support team for EXADATA solutions Lost write protection disabled .... (PID:56033732): Using STANDBY_ARCHIVE_DEST parameter default value as /archive/FRET [krsd.c:18141]

Log 3:
2023-10-31T07:50:25.506527+01:00 Archiving previously deferred ORLs (FRET) Completed: ALTER DATABASE   MOUNT

Log 4:
2023-10-31T07:50:26.277712+01:00 TMON (PID:4391574): STARTING ARCH PROCESSES Starting background process ARC1

Log 5:
2023-10-31T07:50:26.316893+01:00 ARC1 started with pid=37, OS id=53805482 Starting background process ARC2

Log 6:
2023-10-31T07:50:26.362546+01:00 ARC2 started with pid=43, OS i

In [19]:
new_sample_logs.to_csv("222sample_logs_labeled222.csv", index=False)
print("Labeled sample logs have been saved to 'sample_logs_labeled.csv'.")

Labeled sample logs have been saved to 'sample_logs_labeled.csv'.


In [20]:
import pandas as pd

# Load the previously labeled data
existing_labeled_data = pd.read_csv('sample_logs_labeled.csv')


# Assume you manually label these new samples and save them
# Load the newly labeled data
new_labeled_data = pd.read_csv('222sample_logs_labeled222.csv')

# Append the new labeled data to the existing data
combined_labeled_data = pd.concat([existing_labeled_data, new_labeled_data], ignore_index=True)

# Save the combined data back to the file
combined_labeled_data.to_csv('labeled_data.csv', index=False)


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load the manually labeled sample
labeled_sample_path = "labeled_data.csv"
labeled_sample = pd.read_csv(labeled_sample_path)

# Split the data into features and labels
X = labeled_sample['log_message']
y = labeled_sample['severity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.99      0.97       155
           1       0.00      0.00      0.00         3
           2       0.95      0.90      0.93        42

    accuracy                           0.95       200
   macro avg       0.64      0.63      0.63       200
weighted avg       0.94      0.95      0.95       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Count the number of warnings in the log file
pattern = r'error'
num_warnings = df_logs[df_logs['log_message'].str.contains(pattern, case=False)].shape[0]
print(f"Number of warnings: {num_warnings}")




In [28]:
import pandas as pd

# Adjust display settings to show all rows
pd.set_option('display.max_rows', None)

# Define the pattern to match the log entries
#pattern = r'panic|fatal|incident|crash|critical|failure|halt'
pattern = r'ORA-00313|ORA-00312|ORA-27037|ORA-00344|ORA-27044'

# Filter the DataFrame to get the log entries that match the pattern
matching_logs = df_logs[df_logs['log_message'].str.contains(pattern, case=False)]

# Display the matching log entries
print(matching_logs)


                                              log_message
710     2023-10-31T08:12:57.337645+01:00 Errors in fil...
711     2023-10-31T08:12:57.339717+01:00 Errors in fil...
712     2023-10-31T08:12:57.343295+01:00 Errors in fil...
714     2023-10-31T08:12:57.357188+01:00 Errors in fil...
715     2023-10-31T08:12:57.359582+01:00 Errors in fil...
716     2023-10-31T08:12:57.361359+01:00 Errors in fil...
717     2023-10-31T08:12:57.364058+01:00 Errors in fil...
718     2023-10-31T08:12:57.366761+01:00 Errors in fil...
719     2023-10-31T08:12:57.369979+01:00 Errors in fil...
720     2023-10-31T08:12:57.372269+01:00 Errors in fil...
721     2023-10-31T08:12:57.375430+01:00 Errors in fil...
722     2023-10-31T08:12:57.392545+01:00 Errors in fil...
724     2023-10-31T08:12:57.436760+01:00 Errors in fil...
727     2023-10-31T08:12:57.544959+01:00 Errors in fil...
730     2023-10-31T08:12:57.629481+01:00 Errors in fil...
732     2023-10-31T08:12:57.697211+01:00 Errors in fil...
737     2023-1

In [45]:
labeled_sample_path = "labeled_data.csv"
labeled_sample = pd.read_csv(labeled_sample_path)
# Define a pattern to match critical errors
critical_pattern = r'ORA-00344|ORA-27044'

# Update severity to 3 for matching log messages
labeled_sample.loc[labeled_sample['log_message'].str.contains(critical_pattern, case=False), 'severity'] = 3

# Verify the updates
labeled_sample[labeled_sample['severity'] == 3]


Unnamed: 0,log_message,severity
808,2023-10-31T08:13:56.635882+01:00 Errors in fil...,3
813,2023-10-31T08:13:56.657260+01:00 Errors in fil...,3
991,2023-10-31T08:33:48.460364+01:00 Errors in fil...,3
996,2023-10-31T08:33:48.482789+01:00 Errors in fil...,3


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load the manually labeled sample
labeled_sample_path = "updated_labeled_data.csv"
labeled_sample = pd.read_csv(labeled_sample_path)

# Split the data into features and labels
X = labeled_sample['log_message']
y = labeled_sample['severity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.99      0.97       155
           1       0.00      0.00      0.00         3
           2       0.95      0.90      0.93        42

    accuracy                           0.95       200
   macro avg       0.64      0.63      0.63       200
weighted avg       0.94      0.95      0.95       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
new_sample_logs1 = df_logs.iloc[1000:1200].reset_index(drop=True)
new_sample_logs1.head(10)

Unnamed: 0,log_message
0,2023-10-31T08:33:48.496097+01:00 Errors in fil...
1,2023-10-31T08:33:48.502500+01:00 Errors in fil...
2,2023-10-31T08:33:48.504404+01:00 Errors in fil...
3,2023-10-31T08:33:48.870675+01:00 Errors in fil...
4,2023-10-31T08:33:48.874964+01:00 Errors in fil...
5,2023-10-31T08:33:48.879400+01:00 Errors in fil...
6,2023-10-31T08:33:48.881521+01:00 Errors in fil...
7,2023-10-31T08:33:48.887440+01:00 Errors in fil...
8,2023-10-31T08:33:48.889611+01:00 Errors in fil...
9,2023-10-31T08:33:48.897446+01:00 Errors in fil...


In [48]:
def manual_labeling_with_severity(logs):
    labels = []
    print("Veuillez étiqueter les logs avec les catégories suivantes et leur gravité correspondante:")
    for category, severity in categories.items():
        print(f"{category}: {severity}")
    
    for index, log in logs.iterrows():
        print(f"\nLog {index + 1}:")
        print(log['log_message'])
        label = input("Entrez la catégorie (information, warning, error, critical): ")
        while label not in categories:
            print("Catégorie invalide. Veuillez entrer une catégorie valide.")
            label = input("Entrez la catégorie (information, warning, error, critical): ")
        labels.append(categories[label])
    
    return labels

# Étiqueter manuellement les logs
sample_labels = manual_labeling_with_severity(new_sample_logs1)
new_sample_logs1['severity'] = sample_labels

Veuillez étiqueter les logs avec les catégories suivantes et leur gravité correspondante:
information: 0
error: 2
critical: 3

Log 1:
2023-10-31T08:33:48.496097+01:00 Errors in file /oracle/ora_base/diag/rdbms/fretdg/FRET/trace/FRET_tt02_19792228.trc: ORA-00367: checksum error in log file header ORA-00316: log 1 of thread 1, type 0 in header is not log file ORA-00312: online log 1 thread 1: '/redolog/FRET/redo011_1.log' TT02 (PID:19792228): Clearing ORL LNO:1 /redolog/FRET/redo011_1.log Clearing online log 1 of thread 1 sequence number 0

Log 2:
2023-10-31T08:33:48.502500+01:00 Errors in file /oracle/ora_base/diag/rdbms/fretdg/FRET/trace/FRET_tt02_19792228.trc: ORA-00313: open failed for members of log group 1 of thread 1 ORA-00312: online log 1 thread 1: '/redolog/FRET/redo011_2.log' ORA-27037: unable to obtain file status IBM AIX RISC System/6000 Error: 2: No such file or directory Additional information: 7

Log 3:
2023-10-31T08:33:48.504404+01:00 Errors in file /oracle/ora_base/diag

In [49]:
new_sample_logs1.to_csv("222sample_logs_labeled222.csv", index=False)
print("Labeled sample logs have been saved to 'sample_logs_labeled.csv'.")

Labeled sample logs have been saved to 'sample_logs_labeled.csv'.


In [50]:
import pandas as pd

# Load the previously labeled data
existing_labeled_data = pd.read_csv('labeled_data.csv')


# Assume you manually label these new samples and save them
# Load the newly labeled data
new_labeled_data = pd.read_csv('222sample_logs_labeled222.csv')

# Append the new labeled data to the existing data
combined_labeled_data = pd.concat([existing_labeled_data, new_labeled_data], ignore_index=True)

# Save the combined data back to the file
combined_labeled_data.to_csv('fianl2.csv', index=False)

In [59]:
import pandas as pd

# Load the CSV file
labeled_sample_path = "fianl2.csv"
labeled_sample = pd.read_csv(labeled_sample_path)

# Define a pattern to match critical errors
critical_pattern = r'ORA-00313|ORA-00344|ORA-27044'

# Update severity to 3 for matching log messages
labeled_sample.loc[labeled_sample['log_message'].str.contains(critical_pattern, case=False), 'severity'] = 3

# Verify the updates
print(labeled_sample[labeled_sample['severity'] == 3])

# Save the updated DataFrame to a new CSV file
new_file_path = "FINAL_severity.csv"
labeled_sample.to_csv(new_file_path, index=False)


                                            log_message  severity
710   2023-10-31T08:12:57.337645+01:00 Errors in fil...         3
711   2023-10-31T08:12:57.339717+01:00 Errors in fil...         3
712   2023-10-31T08:12:57.343295+01:00 Errors in fil...         3
714   2023-10-31T08:12:57.357188+01:00 Errors in fil...         3
715   2023-10-31T08:12:57.359582+01:00 Errors in fil...         3
716   2023-10-31T08:12:57.361359+01:00 Errors in fil...         3
717   2023-10-31T08:12:57.364058+01:00 Errors in fil...         3
718   2023-10-31T08:12:57.366761+01:00 Errors in fil...         3
719   2023-10-31T08:12:57.369979+01:00 Errors in fil...         3
720   2023-10-31T08:12:57.372269+01:00 Errors in fil...         3
721   2023-10-31T08:12:57.375430+01:00 Errors in fil...         3
722   2023-10-31T08:12:57.392545+01:00 Errors in fil...         3
730   2023-10-31T08:12:57.629481+01:00 Errors in fil...         3
732   2023-10-31T08:12:57.697211+01:00 Errors in fil...         3
742   2023

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load the manually labeled sample
labeled_sample_path = "FINAL_severity.csv"
labeled_sample = pd.read_csv(labeled_sample_path)

# Split the data into features and labels
X = labeled_sample['log_message']
y = labeled_sample['severity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       170
           1       0.00      0.00      0.00         3
           2       1.00      0.06      0.12        31
           3       0.63      1.00      0.77        36

    accuracy                           0.87       240
   macro avg       0.64      0.52      0.47       240
weighted avg       0.89      0.87      0.82       240



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the manually labeled sample
labeled_sample_path = "FINAL_severity.csv"
labeled_sample = pd.read_csv(labeled_sample_path)

# Split the data into features and labels
X = labeled_sample['log_message']
y = labeled_sample['severity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99       170
           1       0.00      0.00      0.00         3
           2       1.00      0.94      0.97        31
           3       1.00      1.00      1.00        36

    accuracy                           0.98       240
   macro avg       0.74      0.73      0.74       240
weighted avg       0.97      0.98      0.97       240



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
