# Laboratorio #4 – Familias de Malware
### Jose Hernandez 20053
### Javier Mombiela 20067

## Parte 1 
### Creación del dataset

In [29]:
# importando librerias
import os
import pefile
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [30]:
# Directory that holds the malware samples to analyse.
malware_dir = './MALWR'
# One dict of extracted PE features per successfully parsed sample.
malware_data = []

# Visit every entry of the sample directory; only regular files are parsed.
for malware in os.listdir(malware_dir):
    malwarepath = os.path.join(malware_dir, malware)

    if not os.path.isfile(malwarepath):
        continue

    # Keep a reference outside the try so the handle can always be released.
    pe = None
    try:
        # Raises for files that are not valid PE images (e.g. '.DS_Store').
        pe = pefile.PE(malwarepath)

        # Skip anything pefile does not report as an executable image.
        if not pe.is_exe():
            continue

        # Header-level characteristics of the sample.
        characteristics = pe.FILE_HEADER.Characteristics
        machine = pe.FILE_HEADER.Machine
        number_of_sections = pe.FILE_HEADER.NumberOfSections
        # Section names are raw, NUL-padded bytes. Decode leniently so that a
        # sample with a non-UTF-8 section name is kept instead of being
        # discarded by the exception handler below.
        sections = [
            section.Name.decode('utf-8', errors='replace').strip('\x00')
            for section in pe.sections
        ]

        # Imported function names (kept as returned by pefile; an entry may be
        # None, which the preprocessing step filters out).
        imports = []
        if hasattr(pe, 'DIRECTORY_ENTRY_IMPORT'):
            for entry in pe.DIRECTORY_ENTRY_IMPORT:
                imports.extend(imp.name for imp in entry.imports)

        # Names of the top-level resource directory entries (may be None).
        resources = []
        if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
            resources.extend(
                entry.name for entry in pe.DIRECTORY_ENTRY_RESOURCE.entries
            )

        # A non-empty security data directory is used as the
        # "has a digital signature" flag.
        security_index = pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_SECURITY']
        digital_signatures = pe.OPTIONAL_HEADER.DATA_DIRECTORY[security_index].Size != 0

        # Record every extracted feature for this sample.
        malware_data.append({
            'Filename': malware,
            "Machine": machine,
            "Characteristics": characteristics,
            "Number_of_sections": number_of_sections,
            "Sections": sections,
            "Imports": imports,
            "Resources": resources,
            "Digital_signatures": digital_signatures
        })

    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error al analizar '{malware}': {e}")
    finally:
        # Release the file/mmap held by pefile; without this one handle per
        # sample stays open for the lifetime of the kernel.
        if pe is not None:
            pe.close()

Error al analizar '.DS_Store': 'DOS Header magic not found.'


In [31]:
# Build the feature table from the per-sample records and persist a copy
# to disk (without the index column) for later inspection.
df = pd.DataFrame(data=malware_data)
df.to_csv(path_or_buf="malware.csv", index=False)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Filename,Machine,Characteristics,Number_of_sections,Sections,Imports,Resources,Digital_signatures
0,785003A405BC7A4EBCBB21DDB757BF3F,332,271,3,"[UPX0, UPX1, .rsrc]","[b'LoadLibraryA', b'ExitProcess', b'GetProcAdd...",[None],False
1,1F2EB7B090018D975E6D9B40868C94CA,332,271,3,"[UPX0, UPX1, .rsrc]","[b'LoadLibraryA', b'ExitProcess', b'GetProcAdd...",[None],False
2,FGJKJJ1_2BA0D0083976A5C1E3315413CDCFFCD2,332,271,3,"[UPX0, UPX1, .rsrc]","[b'LoadLibraryA', b'ExitProcess', b'GetProcAdd...",[None],False
3,RTC_7F85D7F628CE62D1D8F7B39D8940472,332,271,3,"[UPX0, UPX1, .rsrc]","[b'LoadLibraryA', b'ExitProcess', b'GetProcAdd...",[None],False
4,JH78C0A33A1B472A8C16123FD696A5CE5EBB,332,271,3,"[UPX0, UPX1, .rsrc]","[b'LoadLibraryA', b'ExitProcess', b'GetProcAdd...",[None],False


### Exploración y preprocesamiento de datos

In [32]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Filename            35 non-null     object
 1   Machine             35 non-null     int64 
 2   Characteristics     35 non-null     int64 
 3   Number_of_sections  35 non-null     int64 
 4   Sections            35 non-null     object
 5   Imports             35 non-null     object
 6   Resources           35 non-null     object
 7   Digital_signatures  35 non-null     bool  
dtypes: bool(1), int64(3), object(4)
memory usage: 2.1+ KB
None


In [33]:
# Summary statistics of the numeric columns; left as the cell's last
# expression so the notebook renders it with its rich table display.
df.describe()

       Machine  Characteristics  Number_of_sections
count     35.0             35.0           35.000000
mean     332.0            271.0            3.028571
std        0.0              0.0            0.169031
min      332.0            271.0            3.000000
25%      332.0            271.0            3.000000
50%      332.0            271.0            3.000000
75%      332.0            271.0            3.000000
max      332.0            271.0            4.000000


In [34]:
# Number of missing values per column, shown as the cell's result.
df.isnull().sum()

Filename              0
Machine               0
Characteristics       0
Number_of_sections    0
Sections              0
Imports               0
Resources             0
Digital_signatures    0
dtype: int64


In [35]:
def _join_names(names):
    """Return the non-None entries of ``names`` as one ', '-separated string.

    Entries that are already ``str`` are used as they are; anything else is
    decoded as UTF-8 with undecodable bytes replaced, so a single malformed
    import or resource name cannot abort the preprocessing of the whole
    dataset with a UnicodeDecodeError.
    """
    decoded = []
    for name in names:
        if name is None:
            continue
        if isinstance(name, str):
            decoded.append(name)
        else:
            # Positional arguments only: the resource-name objects come from
            # pefile and are not guaranteed to accept keyword arguments.
            decoded.append(name.decode('utf-8', 'replace'))
    return ', '.join(decoded)


# Flatten the list-valued columns into plain strings so they can be encoded.
df['Sections'] = df['Sections'].apply(', '.join)
df['Imports'] = df['Imports'].apply(_join_names)
df['Resources'] = df['Resources'].apply(_join_names)

# One-hot encode every remaining text column.
# NOTE(review): this also encodes 'Filename', which yields one indicator
# column per sample; consider dropping it from the feature matrix. Left
# unchanged here because later cells may rely on the current columns.
df = pd.get_dummies(df)

# Standardise every feature to zero mean and unit variance.
scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

Unnamed: 0,Machine,Characteristics,Number_of_sections,Digital_signatures,Filename_1F2EB7B090018D975E6D9B40868C94CA,Filename_33DE5067A433A6EC5C328067DC18EC37,Filename_65018CD542145A3792BA09985734C12A,Filename_650A6FCA433EE243391E4B4C11F09438,Filename_6FAA4740F99408D4D2DDDD0B09BBDEFD,Filename_785003A405BC7A4EBCBB21DDB757BF3F,...,"Sections_UPX0, UPX1, .rsrc","Sections_UPX0, UPX1, UPX2","Imports_CreateFileA, LocalAlloc, Sleep, CreateThread, CreateMutexA, CopyFileW, GetFileSize, CreateProcessA, GetEnvironmentVariableW, GetShortPathNameW, GetStartupInfoA, GetModuleHandleA, ReadFile, LocalFree, GetLastError, GetModuleFileNameW, CloseHandle, SendMessageA, FindWindowExA, RegQueryValueExA, RegCloseKey, RegSetValueExW, RegOpenKeyExA, SHGetSpecialFolderPathW, ShellExecuteW, _controlfp, _except_handler3, __set_app_type, memset, __CxxFrameHandler, strlen, sprintf, memcpy, strcpy, strcat, _mbsnbcpy, _mbsnbcmp, atol, _mbscmp, atoi, fclose, fwrite, fopen, strstr, wcslen, wcstombs, setlocale, wcscmp, wcscat, mbstowcs, wcsrchr, wcscpy, getenv, strcmp, time, free, __dllonexit, _onexit, _exit, _XcptFilter, exit, _acmdln, __getmainargs, _initterm, __setusermatherr, _adjust_fdiv, __p__commode, __p__fmode, _strnicmp, HttpAddRequestHeadersA, HttpOpenRequestA, HttpEndRequestA, InternetWriteFile, HttpSendRequestA, InternetReadFile, HttpQueryInfoA, InternetAttemptConnect, InternetCloseHandle, InternetConnectA, InternetOpenA, HttpSendRequestExA, gethostbyname, inet_ntoa, WSAStartup, gethostname","Imports_GetModuleHandleA, GetWindowsDirectoryA, GetModuleFileNameA, CopyFileA, OutputDebugStringA, CreateProcessA, CreateMutexA, CreateThread, Sleep, LocalAlloc, CreateFileA, GetFileSize, ReadFile, LocalFree, GetLastError, GetStartupInfoA, CloseHandle, SendMessageA, FindWindowExA, OpenSCManagerA, CreateServiceA, ChangeServiceConfig2A, OpenServiceA, CloseServiceHandle, StartServiceA, QueryServiceStatus, RegisterServiceCtrlHandlerA, SetServiceStatus, StartServiceCtrlDispatcherA, RegOpenKeyExA, 
RegQueryValueExA, RegCloseKey, _controlfp, memset, __CxxFrameHandler, strlen, sprintf, memcpy, strcpy, strcat, _mbsnbcpy, _mbsnbcmp, atol, _mbscmp, atoi, fclose, fwrite, fopen, strstr, getenv, strcmp, time, printf, free, __dllonexit, _onexit, _exit, _XcptFilter, exit, _acmdln, __getmainargs, _initterm, __setusermatherr, _adjust_fdiv, __p__commode, __p__fmode, __set_app_type, _except_handler3, _strnicmp, HttpAddRequestHeadersA, HttpOpenRequestA, HttpEndRequestA, InternetWriteFile, HttpSendRequestExA, InternetReadFile, HttpSendRequestA, InternetCloseHandle, InternetOpenA, InternetAttemptConnect, InternetConnectA, HttpQueryInfoA, gethostbyname, inet_ntoa, WSAStartup, gethostname","Imports_LoadLibraryA, ExitProcess, GetProcAddress, VirtualProtect, _iob, ShellExecuteA, InternetOpenA, inet_ntoa","Imports_LoadLibraryA, ExitProcess, GetProcAddress, VirtualProtect, atol, LoadStringA, send","Imports_LoadLibraryA, ExitProcess, GetProcAddress, VirtualProtect, atol, SHChangeNotify, LoadStringA, closesocket","Imports_RegSaveKeyA, BitBlt, LoadLibraryA, ExitProcess, GetProcAddress, VirtualProtect, _iob, Netbios, EnumProcessModules, GetDC, send","Imports_RegSaveKeyA, BitBlt, LoadLibraryA, ExitProcess, GetProcAddress, VirtualProtect, exit, Netbios, atoi, EnumProcessModules, GetDC, WSAGetLastError",Resources_
0,0.0,0.0,-0.171499,0.0,-0.171499,-0.171499,-0.171499,-0.171499,-0.171499,5.830952,...,0.677003,-0.306186,-0.5,-0.171499,-0.306186,-0.454859,1.154701,-0.171499,-0.246183,0.0
1,0.0,0.0,-0.171499,0.0,5.830952,-0.171499,-0.171499,-0.171499,-0.171499,-0.171499,...,0.677003,-0.306186,-0.5,-0.171499,-0.306186,2.198484,-0.866025,-0.171499,-0.246183,0.0
2,0.0,0.0,-0.171499,0.0,-0.171499,-0.171499,-0.171499,-0.171499,-0.171499,-0.171499,...,0.677003,-0.306186,-0.5,-0.171499,-0.306186,-0.454859,1.154701,-0.171499,-0.246183,0.0
3,0.0,0.0,-0.171499,0.0,-0.171499,-0.171499,-0.171499,-0.171499,-0.171499,-0.171499,...,0.677003,-0.306186,-0.5,-0.171499,3.265986,-0.454859,-0.866025,-0.171499,-0.246183,0.0
4,0.0,0.0,-0.171499,0.0,-0.171499,-0.171499,-0.171499,-0.171499,-0.171499,-0.171499,...,0.677003,-0.306186,-0.5,-0.171499,-0.306186,-0.454859,1.154701,-0.171499,-0.246183,0.0
